mirror of https://github.com/mendableai/firecrawl.git
synced 2025-07-30 04:20:47 +00:00
Nick: wip
This commit is contained in:
parent
8a4f4cb9d9
commit
78badf8f72
apps/api/package.json
@@ -53,7 +53,7 @@
     "@bull-board/api": "^5.20.5",
     "@bull-board/express": "^5.20.5",
     "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.13",
+    "@dqbd/tiktoken": "^1.0.17",
     "@hyperdx/node-opentelemetry": "^0.8.1",
     "@logtail/node": "^0.4.12",
     "@nangohq/node": "^0.40.8",
@@ -73,6 +73,7 @@
     "cacheable-lookup": "^6.1.0",
     "cheerio": "^1.0.0-rc.12",
     "cohere": "^1.1.1",
+    "cohere-ai": "^7.14.0",
     "cors": "^2.8.5",
     "cron-parser": "^4.9.0",
     "date-fns": "^3.6.0",
apps/api/pnpm-lock.yaml (generated, 1324 lines)
File diff suppressed because it is too large
apps/api/src/controllers/v1/extract.ts (new file, 197 lines)
@@ -0,0 +1,197 @@
import { Request, Response } from "express";
import { Logger } from "../../lib/logger";
import {
  Document,
  legacyDocumentConverter,
  legacyExtractorOptions,
  legacyScrapeOptions,
  RequestWithAuth,
  ExtractRequest,
  extractRequestSchema,
  ExtractResponse,
  legacyCrawlerOptions,
  MapDocument,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";
import { getMapResults } from "./map";
import { rerankDocuments } from "../../lib/extract/reranker";
import { generateBasicCompletion } from "../../lib/extract/completions";


export async function extractController(
  req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
  res: Response<ExtractResponse>
) {
  req.body = extractRequestSchema.parse(req.body);
  let earlyReturn = false;

  const origin = req.body.origin;
  const timeout = req.body.timeout;
  // const pageOptions = legacyScrapeOptions(req.body);
  // const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
  const jobId = uuidv4();

  const startTime = new Date().getTime();
  const jobPriority = await getJobPriority({
    plan: req.auth.plan as PlanType,
    team_id: req.auth.team_id,
    basePriority: 10,
  });

  const urls = req.body.urls;
  const mappedDocuments: MapDocument[] = [];

  const prompt = req.body.prompt;
  const keywords = await generateBasicCompletion(`If the user's prompt is: "${prompt}", what are the most important keywords besides the extraction task? Output only the keywords, separated by commas.`);

  for (const url of urls) {
    if (url.endsWith("/*")) {
      const mapResults = await getMapResults({
        url: url.slice(0, -2),
        search: req.body.prompt,
        limit: 100,
        ignoreSitemap: true,
        includeSubdomains: false,
        crawlerOptions: {},
        teamId: req.auth.team_id,
        plan: req.auth.plan,
        origin: req.body.origin,
        subId: req.acuc?.sub_id,
        includeMetadata: true
      });
      // top 3 links
      const top3Links = (mapResults.links as MapDocument[]).slice(0, 3);
      console.log(top3Links);
      // console.log(top3Links);
      mappedDocuments.push(...(mapResults.links as MapDocument[]));
      // transform mappedUrls to just documents
      // we quickly rerank
      const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + req.body.prompt.toLocaleLowerCase().replace("extract", " ").replace("extract ", " "));
      console.log(rerank);
    } else {
      mappedDocuments.push({ url });
    }
  }

  req.body.urls = mappedDocuments.map(x => x.url);

  // const job = await addScrapeJob(
  //   {
  //     url: req.body.url,
  //     mode: "single_urls",
  //     crawlerOptions: {},
  //     team_id: req.auth.team_id,
  //     plan: req.auth.plan,
  //     pageOptions,
  //     extractorOptions,
  //     origin: req.body.origin,
  //     is_scrape: true,
  //   },
  //   {},
  //   jobId,
  //   jobPriority
  // );

  // const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);

  // let doc: any | undefined;
  // try {
  //   doc = (await waitForJob(job.id, timeout + totalWait))[0];
  // } catch (e) {
  //   Logger.error(`Error in scrapeController: ${e}`);
  //   if (e instanceof Error && e.message.startsWith("Job wait")) {
  //     return res.status(408).json({
  //       success: false,
  //       error: "Request timed out",
  //     });
  //   } else {
  //     return res.status(500).json({
  //       success: false,
  //       error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
  //         extractorOptions && extractorOptions.mode !== "markdown"
  //           ? " - Could be due to LLM parsing issues"
  //           : ""
  //       }`,
  //     });
  //   }
  // }

  // await job.remove();

  // if (!doc) {
  //   console.error("!!! PANIC DOC IS", doc, job);
  //   return res.status(200).json({
  //     success: true,
  //     warning: "No page found",
  //     data: doc,
  //   });
  // }

  // delete doc.index;
  // delete doc.provider;

  // const endTime = new Date().getTime();
  // const timeTakenInSeconds = (endTime - startTime) / 1000;
  // const numTokens =
  //   doc && doc.markdown
  //     ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
  //     : 0;

  // let creditsToBeBilled = 1; // Assuming 1 credit per document
  // if (earlyReturn) {
  //   // Don't bill if we're early returning
  //   return;
  // }
  // if(req.body.extract && req.body.formats.includes("extract")) {
  //   creditsToBeBilled = 5;
  // }

  // billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
  //   Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
  //   // Optionally, you could notify an admin or add to a retry queue here
  // });

  // if (!pageOptions || !pageOptions.includeRawHtml) {
  //   if (doc && doc.rawHtml) {
  //     delete doc.rawHtml;
  //   }
  // }

  // if(pageOptions && pageOptions.includeExtract) {
  //   if(!pageOptions.includeMarkdown && doc && doc.markdown) {
  //     delete doc.markdown;
  //   }
  // }

  // logJob({
  //   job_id: jobId,
  //   success: true,
  //   message: "Scrape completed",
  //   num_docs: 1,
  //   docs: [doc],
  //   time_taken: timeTakenInSeconds,
  //   team_id: req.auth.team_id,
  //   mode: "scrape",
  //   url: req.body.url,
  //   crawlerOptions: {},
  //   pageOptions: pageOptions,
  //   origin: origin,
  //   extractor_options: extractorOptions,
  //   num_tokens: numTokens,
  // });

  return res.status(200).json({
    success: true,
    data: null,
    scrape_id: origin?.includes("website") ? jobId : undefined,
  });
}
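The controller above is still WIP: wildcard URLs are mapped and reranked, while the actual scrape/extract flow remains commented out and the response is stubbed with data: null. For orientation, a client call exercising it might look like the sketch below; the /v1/extract path, base URL, and Authorization header are assumptions (route registration is not part of this diff), and only the body shape follows the extractV1Options schema added later in this commit.

// Illustrative client call in TypeScript, not part of this commit.
const res = await fetch(`${process.env.FIRECRAWL_API_URL}/v1/extract`, {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    urls: ["https://docs.example.com/*"], // a trailing "/*" triggers the map + rerank path
    prompt: "Extract the pricing tiers and their monthly costs",
    timeout: 60000,
  }),
});
const json = await res.json(); // currently { success: true, data: null, scrape_id? } while the flow is WIP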
apps/api/src/controllers/v1/map.ts
@@ -15,11 +15,11 @@ import {
   removeDuplicateUrls,
 } from "../../lib/validateUrl";
 import { fireEngineMap } from "../../search/fireEngine";
-import { billTeam } from "../../services/billing/credit_billing";
-import { logJob } from "../../services/logging/log_job";
 import { performCosineSimilarity } from "../../lib/map-cosine";
 import { Logger } from "../../lib/logger";
 import Redis from "ioredis";
+import { billTeam } from "../../services/billing/credit_billing";
+import { logJob } from "../../services/logging/log_job";
 
 configDotenv();
 const redis = new Redis(process.env.REDIS_URL);
@@ -29,35 +29,50 @@ const MAX_MAP_LIMIT = 5000;
 // Max Links that "Smart /map" can return
 const MAX_FIRE_ENGINE_RESULTS = 1000;
 
-export async function mapController(
-  req: RequestWithAuth<{}, MapResponse, MapRequest>,
-  res: Response<MapResponse>
-) {
+interface MapOptions {
+  url: string;
+  search?: string;
+  limit?: number;
+  ignoreSitemap?: boolean;
+  includeSubdomains?: boolean;
+  crawlerOptions?: any;
+  teamId: string;
+  plan: string;
+  origin?: string;
+  subId?: string;
+  includeMetadata?: boolean;
+}
+
+export async function getMapResults({
+  url,
+  search,
+  limit = MAX_MAP_LIMIT,
+  ignoreSitemap = false,
+  includeSubdomains = false,
+  crawlerOptions = {},
+  teamId,
+  plan,
+  origin,
+  subId,
+  includeMetadata = false,
+}: MapOptions) {
   const startTime = new Date().getTime();
 
-  req.body = mapRequestSchema.parse(req.body);
-
-  const limit: number = req.body.limit ?? MAX_MAP_LIMIT;
-
   const id = uuidv4();
-  let links: string[] = [req.body.url];
+  let links: { url: string; title?: string; description?: string }[] = [{ url }];
 
   const sc: StoredCrawl = {
-    originUrl: req.body.url,
-    crawlerOptions: legacyCrawlerOptions(req.body),
+    originUrl: url,
+    crawlerOptions,
     pageOptions: {},
-    team_id: req.auth.team_id,
+    team_id: teamId,
     createdAt: Date.now(),
-    plan: req.auth.plan,
+    plan,
   };
 
   const crawler = crawlToCrawler(id, sc);
 
-  let urlWithoutWww = req.body.url.replace("www.", "");
-
-  let mapUrl = req.body.search
-    ? `"${req.body.search}" site:${urlWithoutWww}`
-    : `site:${req.body.url}`;
+  let urlWithoutWww = url.replace("www.", "");
+  let mapUrl = search ? `"${search}" site:${urlWithoutWww}` : `site:${url}`;
 
   const resultsPerPage = 100;
   const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
@@ -81,12 +96,11 @@ export async function mapController(
     pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
     allResults = await Promise.all(pagePromises);
 
-    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
+    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60);
   }
 
-  // Parallelize sitemap fetch with serper search
   const [sitemap, ...searchResults] = await Promise.all([
-    req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
+    ignoreSitemap ? null : crawler.tryGetSitemap(),
     ...(cachedResult ? [] : pagePromises),
   ]);
 
@@ -96,7 +110,7 @@ export async function mapController(
 
   if (sitemap !== null) {
     sitemap.forEach((x) => {
-      links.push(x.url);
+      links.push({ url: x.url });
     });
   }
 
@@ -110,67 +124,96 @@ export async function mapController(
   }
 
   if (mapResults.length > 0) {
-    if (req.body.search) {
-      // Ensure all map results are first, maintaining their order
+    if (search) {
       links = [
-        mapResults[0].url,
-        ...mapResults.slice(1).map((x) => x.url),
+        { url: mapResults[0].url, title: mapResults[0].title, description: mapResults[0].description },
+        ...mapResults.slice(1).map((x) => ({
+          url: x.url,
+          title: x.title,
+          description: x.description
+        })),
         ...links,
       ];
     } else {
-      mapResults.map((x) => {
-        links.push(x.url);
+      mapResults.forEach((x) => {
+        links.push({
+          url: x.url,
+          title: x.title,
+          description: x.description
+        });
       });
     }
   }
 
-  // Perform cosine similarity between the search query and the list of links
-  if (req.body.search) {
-    const searchQuery = req.body.search.toLowerCase();
-
-    links = performCosineSimilarity(links, searchQuery);
+  if (search) {
+    const filteredLinks = performCosineSimilarity(links.map(l => l.url), search.toLowerCase());
+    links = links.filter(l => filteredLinks.includes(l.url));
   }
 
   links = links
     .map((x) => {
       try {
-        return checkAndUpdateURLForMap(x).url.trim();
+        return { ...x, url: checkAndUpdateURLForMap(x.url).url.trim() };
       } catch (_) {
         return null;
       }
     })
     .filter((x) => x !== null);
 
-  // allows for subdomains to be included
-  links = links.filter((x) => isSameDomain(x, req.body.url));
+  links = links.filter((x) => isSameDomain(x.url, url));
 
-  // if includeSubdomains is false, filter out subdomains
-  if (!req.body.includeSubdomains) {
-    links = links.filter((x) => isSameSubdomain(x, req.body.url));
+  if (!includeSubdomains) {
+    links = links.filter((x) => isSameSubdomain(x.url, url));
   }
 
-  // remove duplicates that could be due to http/https or www
-  links = removeDuplicateUrls(links);
-
-  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
-    Logger.error(
-      `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
-    );
-    // Optionally, you could notify an admin or add to a retry queue here
-  });
+  links = removeDuplicateUrls(links.map(l => l.url)).map(url => links.find(l => l.url === url));
 
   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;
 
   const linksToReturn = links.slice(0, limit);
 
-  logJob({
-    job_id: id,
-    success: links.length > 0,
+  return {
+    links: includeMetadata ? linksToReturn : linksToReturn.map(l => l.url),
+    scrapeId: origin?.includes("website") ? id : undefined,
+    timeTakenInSeconds,
+    id,
+    linksLength: links.length,
+    linksToReturnLength: linksToReturn.length,
+    docs: linksToReturn.map(l => l.url),
+  };
+}
+
+export async function mapController(
+  req: RequestWithAuth<{}, MapResponse, MapRequest>,
+  res: Response<MapResponse>
+) {
+  req.body = mapRequestSchema.parse(req.body);
+
+  const results = await getMapResults({
+    url: req.body.url,
+    search: req.body.search,
+    limit: req.body.limit,
+    ignoreSitemap: req.body.ignoreSitemap,
+    includeSubdomains: req.body.includeSubdomains,
+    crawlerOptions: legacyCrawlerOptions(req.body),
+    teamId: req.auth.team_id,
+    plan: req.auth.plan,
+    origin: req.body.origin,
+    subId: req.acuc?.sub_id,
+  });
+
+  await billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
+    Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
+  });
+
+  await logJob({
+    job_id: results.id,
+    success: results.linksLength > 0,
     message: "Map completed",
-    num_docs: linksToReturn.length,
-    docs: linksToReturn,
-    time_taken: timeTakenInSeconds,
+    num_docs: results.linksToReturnLength,
+    docs: results.docs,
+    time_taken: results.timeTakenInSeconds,
     team_id: req.auth.team_id,
     mode: "map",
     url: req.body.url,
@@ -183,55 +226,7 @@ export async function mapController(
 
   return res.status(200).json({
     success: true,
-    links: linksToReturn,
-    scrape_id: req.body.origin?.includes("website") ? id : undefined,
+    links: results.links.map(l => l.url),
+    scrape_id: results.scrapeId,
   });
 }
-
-// Subdomain sitemap url checking
-
-// // For each result, check for subdomains, get their sitemaps and add them to the links
-// const processedUrls = new Set();
-// const processedSubdomains = new Set();
-
-// for (const result of links) {
-//   let url;
-//   let hostParts;
-//   try {
-//     url = new URL(result);
-//     hostParts = url.hostname.split('.');
-//   } catch (e) {
-//     continue;
-//   }
-
-//   console.log("hostParts", hostParts);
-//   // Check if it's a subdomain (more than 2 parts, and not 'www')
-//   if (hostParts.length > 2 && hostParts[0] !== 'www') {
-//     const subdomain = hostParts[0];
-//     console.log("subdomain", subdomain);
-//     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
-//     console.log("subdomainUrl", subdomainUrl);
-
-//     if (!processedSubdomains.has(subdomainUrl)) {
-//       processedSubdomains.add(subdomainUrl);
-
-//       const subdomainCrawl = crawlToCrawler(id, {
-//         originUrl: subdomainUrl,
-//         crawlerOptions: legacyCrawlerOptions(req.body),
-//         pageOptions: {},
-//         team_id: req.auth.team_id,
-//         createdAt: Date.now(),
-//         plan: req.auth.plan,
-//       });
-//       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
-//       if (subdomainSitemap) {
-//         subdomainSitemap.forEach((x) => {
-//           if (!processedUrls.has(x.url)) {
-//             processedUrls.add(x.url);
-//             links.push(x.url);
-//           }
-//         });
-//       }
-//     }
-//   }
-// }
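With this refactor, the map logic is callable outside the HTTP handler, which is exactly what the new extract controller does. A minimal sketch of a direct call follows; teamId and plan are placeholder values, while the option names and returned fields come from the diff above.

// Illustrative direct call to the refactored helper.
const results = await getMapResults({
  url: "https://example.com",
  search: "pricing",
  limit: 100,
  ignoreSitemap: true,
  includeSubdomains: false,
  includeMetadata: true, // return { url, title?, description? } objects instead of bare URLs
  teamId: "team_123",    // placeholder
  plan: "standard",      // placeholder
});
// results => { links, scrapeId?, timeTakenInSeconds, id, linksLength, linksToReturnLength, docs }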
apps/api/src/controllers/v1/types.ts
@@ -121,8 +121,21 @@ export const scrapeOptions = z.object({
 }).strict(strictMessage)
 
 
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
 
+export const extractV1Options = z.object({
+  urls: url.array(),
+  prompt: z.string().optional(),
+  schema: z.any().optional(),
+  origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(60000),
+}).strict(strictMessage)
+
+export type ExtractV1Options = z.infer<typeof extractV1Options>;
+export const extractRequestSchema = extractV1Options;
+export type ExtractRequest = z.infer<typeof extractRequestSchema>;
+
 export const scrapeRequestSchema = scrapeOptions.extend({
   url,
   origin: z.string().optional().default("api"),
@@ -142,6 +155,8 @@ export const scrapeRequestSchema = scrapeOptions.extend({
   return obj;
 });
 
+
+
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 
 export const batchScrapeRequestSchema = scrapeOptions.extend({
@@ -296,6 +311,21 @@ export interface ScrapeResponseRequestTest {
   error?: string;
 }
 
+export type ExtractResponse =
+  | ErrorResponse
+  | {
+      success: true;
+      warning?: string;
+      data: Document;
+      scrape_id?: string;
+    };
+
+export interface ExtractResponseRequestTest {
+  statusCode: number;
+  body: ExtractResponse;
+  error?: string;
+}
+
 export type CrawlResponse =
   | ErrorResponse
   | {
@@ -492,3 +522,11 @@ export function legacyDocumentConverter(doc: any): Document {
     },
   };
 }
+
+
+
+export interface MapDocument {
+  url: string;
+  title?: string;
+  description?: string;
+}
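A small sketch of how the new schema validates an extract request; the URLs and prompt are illustrative, and the defaults noted in the comment come from the schema definition above.

// Illustrative request body run through the new schema.
const body = extractRequestSchema.parse({
  urls: ["https://example.com/blog/*", "https://example.com/pricing"],
  prompt: "List every product name mentioned on these pages",
});
// body.origin defaults to "api" and body.timeout to 60000; schema stays optional (z.any()).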
apps/api/src/lib/extract/completions.ts (new file, 119 lines)
@@ -0,0 +1,119 @@
import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { ExtractOptions } from "../../controllers/v1/types";
import { Document } from "../entities";
import { z } from "zod";

const maxTokens = 32000;
const modifier = 4;

export class LLMRefusalError extends Error {
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    this.name = "LLMRefusalError";
  }
}

interface GenerateCompletionsParams {
  systemPrompt?: string;
  prompt?: string;
  schema?: any;
  pagesContent: string;
}

export async function generateBasicCompletion(prompt: string) {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  const completion = await openai.chat.completions.create({
    model,
    messages: [{ role: "user", content: prompt }],
  });

  return completion.choices[0].message.content;
}

export async function generateFinalExtraction({
  pagesContent,
  systemPrompt,
  prompt,
  schema,
}: GenerateCompletionsParams): Promise<{
  content: string;
  metadata: { numTokens: number; warning: string };
}> {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  let extractionContent = pagesContent;
  let numTokens = 0;
  let warning = "";

  const encoder = encoding_for_model(model);
  try {
    const tokens = encoder.encode(extractionContent);
    numTokens = tokens.length;
  } catch (error) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
  } finally {
    encoder.free();
  }

  if (numTokens > maxTokens) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
  }

  if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
    schema = {
      type: "object",
      properties: {
        items: schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  }

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    messages: [
      { role: "system", content: systemPrompt },
      { role: "user", content: [{ type: "text", text: extractionContent }] },
      {
        role: "user",
        content: prompt
          ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
          : "Transform the above content into structured JSON output.",
      },
    ],
    response_format: schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "websiteContent",
            schema: schema.shape,
            strict: true,
          },
        }
      : { type: "json_object" },
  });

  if (jsonCompletion.choices[0].message.refusal !== null) {
    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
  }

  const extraction = jsonCompletion.choices[0].message.parsed;

  return {
    content: extraction,
    metadata: {
      numTokens,
      warning,
    },
  };
}
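A sketch of how these helpers might be used together. It assumes OPENAI_API_KEY is set (new OpenAI() reads it) and that MODEL_NAME optionally overrides the default gpt-4o-mini; pagesContent and productSchema are placeholders, not values from this commit.

// Illustrative usage; pagesContent and productSchema are placeholders.
const keywords = await generateBasicCompletion(
  `If the user's prompt is: "extract all pricing tiers", what are the most important keywords? Output only the keywords, separated by commas.`
);

const { content, metadata } = await generateFinalExtraction({
  pagesContent,           // concatenated markdown/text of the scraped pages (placeholder)
  prompt: "Extract the pricing tiers and their monthly costs",
  schema: productSchema,  // placeholder; array schemas are wrapped in { items: ... } internally
});
// metadata.warning is non-empty when the input had to be trimmed to the 32,000-token budget.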
apps/api/src/lib/extract/reranker.ts (new file, 22 lines)
@@ -0,0 +1,22 @@
import { CohereClient } from "cohere-ai";
import { MapDocument } from "../../controllers/v1/types";
const cohere = new CohereClient({
  token: process.env.COHERE_API_KEY,
});

export async function rerankDocuments(
  documents: (string | Record<string, string>)[],
  query: string,
  topN = 3,
  model = "rerank-english-v3.0"
) {
  const rerank = await cohere.v2.rerank({
    documents,
    query,
    topN,
    model,
    returnDocuments: true,
  });

  return rerank.results.sort((a, b) => b.relevanceScore - a.relevanceScore).map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore }));
}
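A sketch of calling the new reranker directly, assuming COHERE_API_KEY is set; the document strings and query are illustrative and mirror how extract.ts formats them.

// Illustrative call to the new reranker helper.
const ranked = await rerankDocuments(
  [
    "URL: https://example.com/pricing\nTITLE: Pricing\nDESCRIPTION: Plans and costs",
    "URL: https://example.com/blog\nTITLE: Blog\nDESCRIPTION: Company news",
  ],
  "What URLs are most relevant to the following prompt: pricing tiers",
  2 // topN
);
// ranked => [{ document, index, relevanceScore }, ...] sorted by relevanceScore, highest first.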