mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-12-29 16:16:46 +00:00
232 lines
7.2 KiB
TypeScript
232 lines
7.2 KiB
TypeScript
import { MapDocument, URLTrace } from "../../controllers/v1/types";
|
|
import { getMapResults } from "../../controllers/v1/map";
|
|
import { PlanType } from "../../types";
|
|
import { removeDuplicateUrls } from "../validateUrl";
|
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
|
import { generateBasicCompletion } from "../LLM-extraction";
|
|
import { buildRefrasedPrompt } from "./build-prompts";
|
|
import { rerankLinksWithLLM } from "./reranker";
|
|
import { extractConfig } from "./config";
|
|
import { updateExtract } from "./extract-redis";
|
|
import { ExtractStep } from "./extract-redis";
|
|
|
|
interface ProcessUrlOptions {
|
|
url: string;
|
|
prompt?: string;
|
|
schema?: any;
|
|
teamId: string;
|
|
plan: PlanType;
|
|
allowExternalLinks?: boolean;
|
|
origin?: string;
|
|
limit?: number;
|
|
includeSubdomains?: boolean;
|
|
}
|
|
|
|
export async function processUrl(
|
|
options: ProcessUrlOptions,
|
|
urlTraces: URLTrace[],
|
|
updateExtractCallback: (links: string[]) => void,
|
|
): Promise<string[]> {
|
|
const trace: URLTrace = {
|
|
url: options.url,
|
|
status: "mapped",
|
|
timing: {
|
|
discoveredAt: new Date().toISOString(),
|
|
},
|
|
};
|
|
urlTraces.push(trace);
|
|
|
|
if (!options.url.includes("/*") && !options.allowExternalLinks) {
|
|
if (!isUrlBlocked(options.url)) {
|
|
trace.usedInCompletion = true;
|
|
return [options.url];
|
|
}
|
|
trace.status = "error";
|
|
trace.error = "URL is blocked";
|
|
trace.usedInCompletion = false;
|
|
return [];
|
|
}
|
|
|
|
const baseUrl = options.url.replace("/*", "");
|
|
let urlWithoutWww = baseUrl.replace("www.", "");
|
|
|
|
let rephrasedPrompt = options.prompt;
|
|
if (options.prompt) {
|
|
rephrasedPrompt =
|
|
(
|
|
await generateBasicCompletion(
|
|
buildRefrasedPrompt(options.prompt, baseUrl),
|
|
)
|
|
)
|
|
?.replace('"', "")
|
|
.replace("/", "") ?? options.prompt;
|
|
}
|
|
|
|
try {
|
|
const mapResults = await getMapResults({
|
|
url: baseUrl,
|
|
search: rephrasedPrompt,
|
|
teamId: options.teamId,
|
|
plan: options.plan,
|
|
allowExternalLinks: options.allowExternalLinks,
|
|
origin: options.origin,
|
|
limit: options.limit,
|
|
ignoreSitemap: false,
|
|
includeMetadata: true,
|
|
includeSubdomains: options.includeSubdomains,
|
|
});
|
|
|
|
let mappedLinks = mapResults.mapResults as MapDocument[];
|
|
let allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
|
|
let uniqueUrls = removeDuplicateUrls(allUrls);
|
|
|
|
// Track all discovered URLs
|
|
uniqueUrls.forEach((discoveredUrl) => {
|
|
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
|
urlTraces.push({
|
|
url: discoveredUrl,
|
|
status: "mapped",
|
|
timing: {
|
|
discoveredAt: new Date().toISOString(),
|
|
},
|
|
usedInCompletion: false,
|
|
});
|
|
}
|
|
});
|
|
|
|
// retry if only one url is returned
|
|
if (uniqueUrls.length <= 1) {
|
|
const retryMapResults = await getMapResults({
|
|
url: baseUrl,
|
|
teamId: options.teamId,
|
|
plan: options.plan,
|
|
allowExternalLinks: options.allowExternalLinks,
|
|
origin: options.origin,
|
|
limit: options.limit,
|
|
ignoreSitemap: false,
|
|
includeMetadata: true,
|
|
includeSubdomains: options.includeSubdomains,
|
|
});
|
|
|
|
mappedLinks = retryMapResults.mapResults as MapDocument[];
|
|
allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
|
|
uniqueUrls = removeDuplicateUrls(allUrls);
|
|
|
|
// Track all discovered URLs
|
|
uniqueUrls.forEach((discoveredUrl) => {
|
|
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
|
urlTraces.push({
|
|
url: discoveredUrl,
|
|
status: "mapped",
|
|
warning: "Broader search. Not limiting map results to prompt.",
|
|
timing: {
|
|
discoveredAt: new Date().toISOString(),
|
|
},
|
|
usedInCompletion: false,
|
|
});
|
|
}
|
|
});
|
|
}
|
|
|
|
// Track all discovered URLs
|
|
uniqueUrls.forEach((discoveredUrl) => {
|
|
if (!urlTraces.some((t) => t.url === discoveredUrl)) {
|
|
urlTraces.push({
|
|
url: discoveredUrl,
|
|
status: "mapped",
|
|
timing: {
|
|
discoveredAt: new Date().toISOString(),
|
|
},
|
|
usedInCompletion: false,
|
|
});
|
|
}
|
|
});
|
|
|
|
const existingUrls = new Set(mappedLinks.map((m) => m.url));
|
|
const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));
|
|
|
|
mappedLinks = [
|
|
...mappedLinks,
|
|
...newUrls.map((url) => ({ url, title: "", description: "" })),
|
|
];
|
|
|
|
if (mappedLinks.length === 0) {
|
|
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
|
|
}
|
|
|
|
// Limit initial set of links (1000)
|
|
mappedLinks = mappedLinks.slice(
|
|
0,
|
|
extractConfig.RERANKING.MAX_INITIAL_RANKING_LIMIT,
|
|
);
|
|
|
|
|
|
updateExtractCallback(mappedLinks.map((x) => x.url));
|
|
|
|
|
|
// Perform reranking using either prompt or schema
|
|
let searchQuery = "";
|
|
if (options.prompt) {
|
|
searchQuery = options.allowExternalLinks
|
|
? `${options.prompt} ${urlWithoutWww}`
|
|
: `${options.prompt} site:${urlWithoutWww}`;
|
|
} else if (options.schema) {
|
|
// Generate search query from schema using basic completion
|
|
try {
|
|
const schemaString = JSON.stringify(options.schema, null, 2);
|
|
const prompt = `Given this JSON schema, generate a natural language search query that would help find relevant pages containing this type of data. Focus on the key properties and their descriptions and keep it very concise. Schema: ${schemaString}`;
|
|
|
|
searchQuery =
|
|
(await generateBasicCompletion(prompt)) ??
|
|
"Extract the data according to the schema: " + schemaString;
|
|
|
|
if (options.allowExternalLinks) {
|
|
searchQuery = `${searchQuery} ${urlWithoutWww}`;
|
|
} else {
|
|
searchQuery = `${searchQuery} site:${urlWithoutWww}`;
|
|
}
|
|
} catch (error) {
|
|
console.error("Error generating search query from schema:", error);
|
|
searchQuery = urlWithoutWww; // Fallback to just the domain
|
|
}
|
|
} else {
|
|
searchQuery = urlWithoutWww;
|
|
}
|
|
|
|
// dumpToFile(
|
|
// "mapped-links.txt",
|
|
// mappedLinks,
|
|
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
|
|
// );
|
|
|
|
const rerankerResult = await rerankLinksWithLLM(mappedLinks, searchQuery, urlTraces);
|
|
mappedLinks = rerankerResult.mapDocument;
|
|
let tokensUsed = rerankerResult.tokensUsed;
|
|
|
|
// 2nd Pass, useful for when the first pass returns too many links
|
|
if (mappedLinks.length > 100) {
|
|
const rerankerResult = await rerankLinksWithLLM(
|
|
mappedLinks,
|
|
searchQuery,
|
|
urlTraces,
|
|
);
|
|
mappedLinks = rerankerResult.mapDocument;
|
|
tokensUsed += rerankerResult.tokensUsed;
|
|
}
|
|
|
|
// dumpToFile(
|
|
// "llm-links.txt",
|
|
// mappedLinks,
|
|
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
|
|
// );
|
|
// Remove title and description from mappedLinks
|
|
mappedLinks = mappedLinks.map((link) => ({ url: link.url }));
|
|
return mappedLinks.map((x) => x.url);
|
|
} catch (error) {
|
|
trace.status = "error";
|
|
trace.error = error.message;
|
|
trace.usedInCompletion = false;
|
|
return [];
|
|
}
|
|
}
|