import { Logger } from "winston";
import * as Sentry from "@sentry/node";

import { Document, ScrapeOptions } from "../../controllers/v1/types";
import { logger } from "../../lib/logger";
import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { AddFeatureError, EngineError, NoEnginesLeftError, SiteError, TimeoutError } from "./error";
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
import { urlSpecificParams } from "./lib/urlSpecificParams";

export type ScrapeUrlResponse = ({
  success: true,
  document: Document,
} | {
  success: false,
  error: any,
}) & {
  logs: any[],
  engines: EngineResultsTracker,
}

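// The union above is discriminated on `success`, so callers can narrow it with a plain
// check. A minimal sketch, assuming some `response: ScrapeUrlResponse` is in scope:
//
//   if (response.success) {
//     console.log(response.document.markdown); // `document` exists only on success
//   } else {
//     console.error(response.error, response.engines);
//   }
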
export type Meta = {
  id: string;
  url: string;
  options: ScrapeOptions;
  internalOptions: InternalOptions;
  logger: Logger;
  logs: any[];
  featureFlags: Set<FeatureFlag>;
}

function buildFeatureFlags(url: string, options: ScrapeOptions, internalOptions: InternalOptions): Set<FeatureFlag> {
  const flags: Set<FeatureFlag> = new Set();

  if (options.actions !== undefined) {
    flags.add("actions");
  }

  if (options.formats.includes("screenshot")) {
    flags.add("screenshot");
  }

  if (options.formats.includes("screenshot@fullPage")) {
    flags.add("screenshot@fullScreen");
  }

  if (options.waitFor !== 0) {
    flags.add("waitFor");
  }

  if (internalOptions.atsv) {
    flags.add("atsv");
  }

  if (options.location || options.geolocation) {
    flags.add("location");
  }

  if (options.mobile) {
    flags.add("mobile");
  }

  if (options.skipTlsVerification) {
    flags.add("skipTlsVerification");
  }

  if (internalOptions.v0UseFastMode) {
    flags.add("useFastMode");
  }

  const urlO = new URL(url);

  if (urlO.pathname.endsWith(".pdf")) {
    flags.add("pdf");
  }

  if (urlO.pathname.endsWith(".docx")) {
    flags.add("docx");
  }

  return flags;
}

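// A quick sketch of what the function above produces. For a hypothetical call like
//
//   buildFeatureFlags(
//     "https://example.com/report.pdf",
//     { formats: ["markdown", "screenshot"], waitFor: 1000, mobile: true } as ScrapeOptions,
//     {},
//   )
//
// the resulting set would contain "screenshot", "waitFor", "mobile", and "pdf".
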
// The meta object contains all information required to perform a scrape:
// for example, the scrape ID, URL, options, feature flags, and logs that occur while scraping.
// The meta object is treated as immutable, except for the logs array and edge cases (e.g. when a new feature flag is suddenly required).
// Treating the meta object as immutable helps the code stay clean and easily traceable,
// while also retaining the benefits that WebScraper had from its OOP design.
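// Note on site-specific overrides: the lookup below strips a leading "www." from the
// hostname before indexing urlSpecificParams, and Object.assign merges any overrides
// into the caller's options objects in place.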
function buildMetaObject(id: string, url: string, options: ScrapeOptions, internalOptions: InternalOptions): Meta {
  const specParams = urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
  if (specParams !== undefined) {
    options = Object.assign(options, specParams.scrapeOptions);
    internalOptions = Object.assign(internalOptions, specParams.internalOptions);
  }

  const _logger = logger.child({ module: "ScrapeURL", scrapeId: id, scrapeURL: url });
  const logs: any[] = [];

  return {
    id, url, options, internalOptions,
    logger: _logger,
    logs,
    featureFlags: buildFeatureFlags(url, options, internalOptions),
  };
}

export type InternalOptions = {
  priority?: number; // Passed along to fire-engine
  forceEngine?: Engine;
  atsv?: boolean; // anti-bot solver, beta

  v0CrawlOnlyUrls?: boolean;
  v0UseFastMode?: boolean;
  v0DisableJsDom?: boolean;
};

export type EngineResultsTracker = { [E in Engine]?: ({
  state: "error",
  error: any,
  unexpected: boolean,
} | {
  state: "success",
  result: EngineScrapeResult & { markdown: string },
  factors: Record<string, boolean>,
  unsupportedFeatures: Set<FeatureFlag>,
} | {
  state: "timeout",
}) & {
  startedAt: number,
  finishedAt: number,
} };

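// Sketch: the tracker keeps one timed entry per engine that was attempted, so an
// attempt summary can be derived by iterating it (hypothetical logging helper):
//
//   for (const [engine, attempt] of Object.entries(results)) {
//     console.log(`${engine}: ${attempt.state} in ${attempt.finishedAt - attempt.startedAt}ms`);
//   }
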
export type EngineScrapeResultWithContext = {
  engine: Engine,
  unsupportedFeatures: Set<FeatureFlag>,
  result: (EngineScrapeResult & { markdown: string }),
};

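// safeguardCircularError strips the `results` tracker off an error before the error is
// recorded. Errors stored in an EngineResultsTracker can themselves carry a tracker
// (see the LLMRefusalError handling below), which may point back at the error and make
// the structure circular when it is later serialized or logged.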
function safeguardCircularError<T>(error: T): T {
  if (typeof error === "object" && error !== null && (error as any).results) {
    const newError = structuredClone(error);
    delete (newError as any).results;
    return newError;
  } else {
    return error;
  }
}

async function scrapeURLLoop(
  meta: Meta
): Promise<ScrapeUrlResponse> {
  meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);

  // TODO: handle sitemap data, see WebScraper/index.ts:280
  // TODO: ScrapeEvents

  const fallbackList = buildFallbackList(meta);

  const results: EngineResultsTracker = {};
  let result: EngineScrapeResultWithContext | null = null;

  for (const { engine, unsupportedFeatures } of fallbackList) {
    const startedAt = Date.now();
    try {
      meta.logger.info("Scraping via " + engine + "...");
      const _engineResult = await scrapeURLWithEngine(meta, engine);
      if (_engineResult.markdown === undefined) { // Some engines emit Markdown directly; convert the HTML for those that don't.
        _engineResult.markdown = await parseMarkdown(_engineResult.html);
      }
      const engineResult = _engineResult as EngineScrapeResult & { markdown: string };

      // Success factors
      const isLongEnough = engineResult.markdown.length >= 20;
      const isGoodStatusCode = (engineResult.statusCode >= 200 && engineResult.statusCode < 300) || engineResult.statusCode === 304;
      const hasNoPageError = engineResult.error === undefined;

      results[engine] = {
        state: "success",
        result: engineResult,
        factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
        unsupportedFeatures,
        startedAt,
        finishedAt: Date.now(),
      };

      // NOTE: TODO: what to do when status code is bad is tough...
      // we cannot just rely on text because error messages can be brief and not hit the limit
      // should we just use all the fallbacks and pick the one with the longest text? - mogery
      if (isLongEnough || !isGoodStatusCode) {
        meta.logger.info("Scrape via " + engine + " deemed successful.", { factors: { isLongEnough, isGoodStatusCode, hasNoPageError } });
        result = {
          engine,
          unsupportedFeatures,
          result: engineResult as EngineScrapeResult & { markdown: string }
        };
        break;
      }
    } catch (error) {
      if (error instanceof EngineError) {
        meta.logger.info("Engine " + engine + " could not scrape the page.", { error });
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: false,
          startedAt,
          finishedAt: Date.now(),
        };
      } else if (error instanceof TimeoutError) {
        meta.logger.info("Engine " + engine + " timed out while scraping.", { error });
        results[engine] = {
          state: "timeout",
          startedAt,
          finishedAt: Date.now(),
        };
      } else if (error instanceof AddFeatureError) {
        throw error;
      } else if (error instanceof LLMRefusalError) {
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: true,
          startedAt,
          finishedAt: Date.now(),
        };
        error.results = results;
        meta.logger.warn("LLM refusal encountered", { error });
        throw error;
      } else if (error instanceof SiteError) {
        throw error;
      } else {
        Sentry.captureException(error);
        meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error });
        results[engine] = {
          state: "error",
          error: safeguardCircularError(error),
          unexpected: true,
          startedAt,
          finishedAt: Date.now(),
        };
      }
    }
  }

  if (result === null) {
    throw new NoEnginesLeftError(fallbackList.map(x => x.engine), results);
  }

  let document: Document = {
    markdown: result.result.markdown,
    rawHtml: result.result.html,
    screenshot: result.result.screenshot,
    actions: result.result.actions,
    metadata: {
      sourceURL: meta.url,
      url: result.result.url,
      statusCode: result.result.statusCode,
      error: result.result.error,
    },
  };

  if (result.unsupportedFeatures.size > 0) {
    const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`;
    meta.logger.warn(warning, { engine: result.engine, unsupportedFeatures: result.unsupportedFeatures });
    document.warning = document.warning !== undefined ? document.warning + " " + warning : warning;
  }

  document = await executeTransformers(meta, document);

  return {
    success: true,
    document,
    logs: meta.logs,
    engines: results,
  };
}

export async function scrapeURL(
  id: string,
  url: string,
  options: ScrapeOptions,
  internalOptions: InternalOptions = {},
): Promise<ScrapeUrlResponse> {
  const meta = buildMetaObject(id, url, options, internalOptions);
  try {
    while (true) {
      try {
        return await scrapeURLLoop(meta);
      } catch (error) {
        if (error instanceof AddFeatureError && meta.internalOptions.forceEngine === undefined) {
          meta.logger.debug("More feature flags requested by scraper: adding " + error.featureFlags.join(", "), { error, existingFlags: meta.featureFlags });
          meta.featureFlags = new Set([...meta.featureFlags].concat(error.featureFlags));
        } else {
          throw error;
        }
      }
    }
  } catch (error) {
    let results: EngineResultsTracker = {};

    if (error instanceof NoEnginesLeftError) {
      meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
      results = error.results;
    } else if (error instanceof LLMRefusalError) {
      meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
      results = error.results!;
    } else if (error instanceof Error && error.message.includes("Invalid schema for response_format")) { // TODO: separate into custom error
      meta.logger.warn("scrapeURL: LLM schema error", { error });
      // TODO: results?
    } else if (error instanceof SiteError) {
      meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
    } else {
      Sentry.captureException(error);
      meta.logger.error("scrapeURL: Unexpected error happened", { error });
      // TODO: results?
    }

    return {
      success: false,
      error,
      logs: meta.logs,
      engines: results,
    };
  }
}
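
// Usage sketch for scrapeURL. The options literal here is hypothetical; real
// ScrapeOptions values come from the v1 controller schema in ../../controllers/v1/types:
//
//   const response = await scrapeURL(
//     crypto.randomUUID(),
//     "https://example.com",
//     { formats: ["markdown"], waitFor: 0 } as ScrapeOptions,
//   );
//   if (!response.success) throw response.error;
//
// Note the retry behavior above: if an engine throws AddFeatureError mid-scrape (and no
// engine is forced), scrapeURL widens meta.featureFlags and re-runs the whole loop,
// which may produce a different fallback list on the next pass.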