firecrawl/apps/api/src/lib/extract/url-processor.ts
Nicolas 4b4385c520 Revert "Nick:"
This reverts commit 6718ce89085339eaaceb1e88a0aa45ecff3216ac.
2025-01-22 17:26:09 -03:00

232 lines
7.2 KiB
TypeScript

import { MapDocument, URLTrace } from "../../controllers/v1/types";
import { getMapResults } from "../../controllers/v1/map";
import { PlanType } from "../../types";
import { removeDuplicateUrls } from "../validateUrl";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { generateBasicCompletion } from "../LLM-extraction";
import { buildRefrasedPrompt } from "./build-prompts";
import { rerankLinksWithLLM } from "./reranker";
import { extractConfig } from "./config";
import { updateExtract } from "./extract-redis";
import { ExtractStep } from "./extract-redis";
interface ProcessUrlOptions {
url: string;
prompt?: string;
schema?: any;
teamId: string;
plan: PlanType;
allowExternalLinks?: boolean;
origin?: string;
limit?: number;
includeSubdomains?: boolean;
}
export async function processUrl(
options: ProcessUrlOptions,
urlTraces: URLTrace[],
updateExtractCallback: (links: string[]) => void,
): Promise<string[]> {
const trace: URLTrace = {
url: options.url,
status: "mapped",
timing: {
discoveredAt: new Date().toISOString(),
},
};
urlTraces.push(trace);
if (!options.url.includes("/*") && !options.allowExternalLinks) {
if (!isUrlBlocked(options.url)) {
trace.usedInCompletion = true;
return [options.url];
}
trace.status = "error";
trace.error = "URL is blocked";
trace.usedInCompletion = false;
return [];
}
const baseUrl = options.url.replace("/*", "");
let urlWithoutWww = baseUrl.replace("www.", "");
let rephrasedPrompt = options.prompt;
if (options.prompt) {
rephrasedPrompt =
(
await generateBasicCompletion(
buildRefrasedPrompt(options.prompt, baseUrl),
)
)
?.replace('"', "")
.replace("/", "") ?? options.prompt;
}
try {
const mapResults = await getMapResults({
url: baseUrl,
search: rephrasedPrompt,
teamId: options.teamId,
plan: options.plan,
allowExternalLinks: options.allowExternalLinks,
origin: options.origin,
limit: options.limit,
ignoreSitemap: false,
includeMetadata: true,
includeSubdomains: options.includeSubdomains,
});
let mappedLinks = mapResults.mapResults as MapDocument[];
let allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
let uniqueUrls = removeDuplicateUrls(allUrls);
// Track all discovered URLs
uniqueUrls.forEach((discoveredUrl) => {
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
urlTraces.push({
url: discoveredUrl,
status: "mapped",
timing: {
discoveredAt: new Date().toISOString(),
},
usedInCompletion: false,
});
}
});
// retry if only one url is returned
if (uniqueUrls.length <= 1) {
const retryMapResults = await getMapResults({
url: baseUrl,
teamId: options.teamId,
plan: options.plan,
allowExternalLinks: options.allowExternalLinks,
origin: options.origin,
limit: options.limit,
ignoreSitemap: false,
includeMetadata: true,
includeSubdomains: options.includeSubdomains,
});
mappedLinks = retryMapResults.mapResults as MapDocument[];
allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
uniqueUrls = removeDuplicateUrls(allUrls);
// Track all discovered URLs
uniqueUrls.forEach((discoveredUrl) => {
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
urlTraces.push({
url: discoveredUrl,
status: "mapped",
warning: "Broader search. Not limiting map results to prompt.",
timing: {
discoveredAt: new Date().toISOString(),
},
usedInCompletion: false,
});
}
});
}
// Track all discovered URLs
uniqueUrls.forEach((discoveredUrl) => {
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
urlTraces.push({
url: discoveredUrl,
status: "mapped",
timing: {
discoveredAt: new Date().toISOString(),
},
usedInCompletion: false,
});
}
});
const existingUrls = new Set(mappedLinks.map((m) => m.url));
const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));
mappedLinks = [
...mappedLinks,
...newUrls.map((url) => ({ url, title: "", description: "" })),
];
if (mappedLinks.length === 0) {
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
}
// Limit initial set of links (1000)
mappedLinks = mappedLinks.slice(
0,
extractConfig.RERANKING.MAX_INITIAL_RANKING_LIMIT,
);
updateExtractCallback(mappedLinks.map((x) => x.url));
// Perform reranking using either prompt or schema
let searchQuery = "";
if (options.prompt) {
searchQuery = options.allowExternalLinks
? `${options.prompt} ${urlWithoutWww}`
: `${options.prompt} site:${urlWithoutWww}`;
} else if (options.schema) {
// Generate search query from schema using basic completion
try {
const schemaString = JSON.stringify(options.schema, null, 2);
const prompt = `Given this JSON schema, generate a natural language search query that would help find relevant pages containing this type of data. Focus on the key properties and their descriptions and keep it very concise. Schema: ${schemaString}`;
searchQuery =
(await generateBasicCompletion(prompt)) ??
"Extract the data according to the schema: " + schemaString;
if (options.allowExternalLinks) {
searchQuery = `${searchQuery} ${urlWithoutWww}`;
} else {
searchQuery = `${searchQuery} site:${urlWithoutWww}`;
}
} catch (error) {
console.error("Error generating search query from schema:", error);
searchQuery = urlWithoutWww; // Fallback to just the domain
}
} else {
searchQuery = urlWithoutWww;
}
// dumpToFile(
// "mapped-links.txt",
// mappedLinks,
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
// );
const rerankerResult = await rerankLinksWithLLM(mappedLinks, searchQuery, urlTraces);
mappedLinks = rerankerResult.mapDocument;
let tokensUsed = rerankerResult.tokensUsed;
// 2nd Pass, useful for when the first pass returns too many links
if (mappedLinks.length > 100) {
const rerankerResult = await rerankLinksWithLLM(
mappedLinks,
searchQuery,
urlTraces,
);
mappedLinks = rerankerResult.mapDocument;
tokensUsed += rerankerResult.tokensUsed;
}
// dumpToFile(
// "llm-links.txt",
// mappedLinks,
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
// );
// Remove title and description from mappedLinks
mappedLinks = mappedLinks.map((link) => ({ url: link.url }));
return mappedLinks.map((x) => x.url);
} catch (error) {
trace.status = "error";
trace.error = error.message;
trace.usedInCompletion = false;
return [];
}
}