mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-10-20 20:41:30 +00:00

* feat: use strictNullChecking * feat: switch logger to Winston * feat(scrapeURL): first batch * fix(scrapeURL): error swallow * fix(scrapeURL): add timeout to EngineResultsTracker * fix(scrapeURL): report unexpected error to sentry * chore: remove unused modules * feat(transformers/coerce): warn when a format's response is missing * feat(scrapeURL): feature flag priorities, engine quality sorting, PDF and DOCX support * (add note) * feat(scrapeURL): wip readme * feat(scrapeURL): LLM extract * feat(scrapeURL): better warnings * fix(scrapeURL/engines/fire-engine;playwright): fix screenshot * feat(scrapeURL): add forceEngine internal option * feat(scrapeURL/engines): scrapingbee * feat(scrapeURL/transformers): uploadScreenshot * feat(scrapeURL): more intense tests * bunch of stuff * get rid of WebScraper (mostly) * adapt batch scrape * add staging deploy workflow * fix yaml * fix logger issues * fix v1 test schema * feat(scrapeURL/fire-engine/chrome-cdp): remove wait inserts on actions * scrapeURL: v0 backwards compat * logger fixes * feat(scrapeurl): v0 returnOnlyUrls support * fix(scrapeURL/v0): URL leniency * fix(batch-scrape): ts non-nullable * fix(scrapeURL/fire-engine/chromecdp): fix wait action * fix(logger): remove error debug key * feat(requests.http): use dotenv expression * fix(scrapeURL/extractMetadata): extract custom metadata * fix crawl option conversion * feat(scrapeURL): Add retry logic to robustFetch * fix(scrapeURL): crawl stuff * fix(scrapeURL): LLM extract * fix(scrapeURL/v0): search fix * fix(tests/v0): grant larger response size to v0 crawl status * feat(scrapeURL): basic fetch engine * feat(scrapeURL): playwright engine * feat(scrapeURL): add url-specific parameters * Update readme and examples * added e2e tests for most parameters. Still a few actions, location and iframes to be done. * fixed type * Nick: * Update scrape.ts * Update index.ts * added actions and base64 check * Nick: skipTls feature flag? 
* 403 * todo * todo * fixes * yeet headers from url specific params * add warning when final engine has feature deficit * expose engine results tracker for ScrapeEvents implementation * ingest scrape events * fixed some tests * comment * Update index.test.ts * fixed rawHtml * Update index.test.ts * update comments * move geolocation to global f-e option, fix removeBase64Images * Nick: * trim url-specific params * Update index.ts --------- Co-authored-by: Eric Ciarla <ericciarla@yahoo.com> Co-authored-by: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com> Co-authored-by: Nicolas <nicolascamara29@gmail.com>
296 lines
9.5 KiB
TypeScript
296 lines
9.5 KiB
TypeScript
import { ScrapeActionContent } from "../../../lib/entities";
|
|
import { Meta } from "..";
|
|
import { scrapeDOCX } from "./docx";
|
|
import { scrapeURLWithFireEngineChromeCDP, scrapeURLWithFireEnginePlaywright, scrapeURLWithFireEngineTLSClient } from "./fire-engine";
|
|
import { scrapePDF } from "./pdf";
|
|
import { scrapeURLWithScrapingBee } from "./scrapingbee";
|
|
import { scrapeURLWithFetch } from "./fetch";
|
|
import { scrapeURLWithPlaywright } from "./playwright";
|
|
|
|
// Identifier of every scraping backend scrapeURL can dispatch to. The three
// "fire-engine;*" variants are sub-modes of the fire-engine service, and
// "scrapingbeeLoad" is the ScrapingBee handler waiting for "networkidle2"
// instead of "domcontentloaded" (see engineHandlers below).
export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx";
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
|
|
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
|
const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
|
|
|
|
export const engines: Engine[] = [
|
|
...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
|
|
...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
|
|
...(usePlaywright ? [ "playwright" as const ] : []),
|
|
"fetch",
|
|
"pdf",
|
|
"docx",
|
|
];
|
|
|
|
// Every scrape feature an engine may or may not support. This is both the
// runtime list of flag names and (via FeatureFlag below) the compile-time
// union of valid flag identifiers.
export const featureFlags = [
  "actions",
  "waitFor",
  "screenshot",
  "screenshot@fullScreen",
  "pdf",
  "docx",
  "atsv",
  "location",
  "mobile",
  "skipTlsVerification",
  "useFastMode",
] as const;

// Union of the literal flag names declared above.
export type FeatureFlag = typeof featureFlags[number];
export const featureFlagOptions: {
|
|
[F in FeatureFlag]: {
|
|
priority: number;
|
|
}
|
|
} = {
|
|
"actions": { priority: 20 },
|
|
"waitFor": { priority: 1 },
|
|
"screenshot": { priority: 10 },
|
|
"screenshot@fullScreen": { priority: 10 },
|
|
"pdf": { priority: 100 },
|
|
"docx": { priority: 100 },
|
|
"atsv": { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not
|
|
"useFastMode": { priority: 90 },
|
|
"location": { priority: 10 },
|
|
"mobile": { priority: 10 },
|
|
"skipTlsVerification": { priority: 10 },
|
|
} as const;
|
|
|
|
// Normalized result shape that every engine handler resolves with,
// regardless of which backend produced it.
export type EngineScrapeResult = {
  // NOTE(review): presumably the final (post-redirect) URL of the scrape —
  // confirm against the individual engine implementations.
  url: string;

  html: string; // HTML content returned by the engine
  markdown?: string; // present only when the engine emits markdown directly
  statusCode: number; // HTTP status code reported by the engine
  error?: string; // error detail reported alongside an otherwise-usable result

  screenshot?: string; // present when a screenshot was requested/produced
  // Outputs collected while executing requested page actions.
  actions?: {
    screenshots: string[];
    scrapes: ScrapeActionContent[];
  };
}
const engineHandlers: {
|
|
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
|
|
} = {
|
|
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
|
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
|
|
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
|
|
"scrapingbee": scrapeURLWithScrapingBee("domcontentloaded"),
|
|
"scrapingbeeLoad": scrapeURLWithScrapingBee("networkidle2"),
|
|
"playwright": scrapeURLWithPlaywright,
|
|
"fetch": scrapeURLWithFetch,
|
|
"pdf": scrapePDF,
|
|
"docx": scrapeDOCX,
|
|
};
|
|
|
|
export const engineOptions: {
|
|
[E in Engine]: {
|
|
// A list of feature flags the engine supports.
|
|
features: { [F in FeatureFlag]: boolean },
|
|
|
|
// This defines the order of engines in general. The engine with the highest quality will be used the most.
|
|
// Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX
|
|
quality: number,
|
|
}
|
|
} = {
|
|
"fire-engine;chrome-cdp": {
|
|
features: {
|
|
"actions": true,
|
|
"waitFor": true, // through actions transform
|
|
"screenshot": true, // through actions transform
|
|
"screenshot@fullScreen": true, // through actions transform
|
|
"pdf": false,
|
|
"docx": false,
|
|
"atsv": false,
|
|
"location": true,
|
|
"mobile": true,
|
|
"skipTlsVerification": true,
|
|
"useFastMode": false,
|
|
},
|
|
quality: 50,
|
|
},
|
|
"fire-engine;playwright": {
|
|
features: {
|
|
"actions": false,
|
|
"waitFor": true,
|
|
"screenshot": true,
|
|
"screenshot@fullScreen": true,
|
|
"pdf": false,
|
|
"docx": false,
|
|
"atsv": false,
|
|
"location": false,
|
|
"mobile": false,
|
|
"skipTlsVerification": false,
|
|
"useFastMode": false,
|
|
},
|
|
quality: 40,
|
|
},
|
|
"scrapingbee": {
|
|
features: {
|
|
"actions": false,
|
|
"waitFor": true,
|
|
"screenshot": true,
|
|
"screenshot@fullScreen": true,
|
|
"pdf": false,
|
|
"docx": false,
|
|
"atsv": false,
|
|
"location": false,
|
|
"mobile": false,
|
|
"skipTlsVerification": false,
|
|
"useFastMode": false,
|
|
},
|
|
quality: 30,
|
|
},
|
|
"scrapingbeeLoad": {
|
|
features: {
|
|
"actions": false,
|
|
"waitFor": true,
|
|
"screenshot": true,
|
|
"screenshot@fullScreen": true,
|
|
"pdf": false,
|
|
"docx": false,
|
|
"atsv": false,
|
|
"location": false,
|
|
"mobile": false,
|
|
"skipTlsVerification": false,
|
|
"useFastMode": false,
|
|
},
|
|
quality: 29,
|
|
},
|
|
"playwright": {
|
|
features: {
|
|
"actions": false,
|
|
"waitFor": true,
|
|
"screenshot": false,
|
|
"screenshot@fullScreen": false,
|
|
"pdf": false,
|
|
"docx": false,
|
|
"atsv": false,
|
|
"location": false,
|
|
"mobile": false,
|
|
"skipTlsVerification": false,
|
|
"useFastMode": false,
|
|
},
|
|
quality: 20,
|
|
},
|
|
"fire-engine;tlsclient": {
|
|
features: {
|
|
"actions": false,
|
|
"waitFor": false,
|
|
"screenshot": false,
|
|
"screenshot@fullScreen": false,
|
|
"pdf": false,
|
|
"docx": false,
|
|
"atsv": true,
|
|
"location": true,
|
|
"mobile": false,
|
|
"skipTlsVerification": false,
|
|
"useFastMode": true,
|
|
},
|
|
quality: 10,
|
|
},
|
|
"fetch": {
|
|
features: {
|
|
"actions": false,
|
|
"waitFor": false,
|
|
"screenshot": false,
|
|
"screenshot@fullScreen": false,
|
|
"pdf": false,
|
|
"docx": false,
|
|
"atsv": false,
|
|
"location": false,
|
|
"mobile": false,
|
|
"skipTlsVerification": false,
|
|
"useFastMode": true,
|
|
},
|
|
quality: 5,
|
|
},
|
|
"pdf": {
|
|
features: {
|
|
"actions": false,
|
|
"waitFor": false,
|
|
"screenshot": false,
|
|
"screenshot@fullScreen": false,
|
|
"pdf": true,
|
|
"docx": false,
|
|
"atsv": false,
|
|
"location": false,
|
|
"mobile": false,
|
|
"skipTlsVerification": false,
|
|
"useFastMode": true,
|
|
},
|
|
quality: -10,
|
|
},
|
|
"docx": {
|
|
features: {
|
|
"actions": false,
|
|
"waitFor": false,
|
|
"screenshot": false,
|
|
"screenshot@fullScreen": false,
|
|
"pdf": false,
|
|
"docx": true,
|
|
"atsv": false,
|
|
"location": false,
|
|
"mobile": false,
|
|
"skipTlsVerification": false,
|
|
"useFastMode": true,
|
|
},
|
|
quality: -10,
|
|
},
|
|
};
|
|
|
|
export function buildFallbackList(meta: Meta): {
|
|
engine: Engine,
|
|
unsupportedFeatures: Set<FeatureFlag>,
|
|
}[] {
|
|
const prioritySum = [...meta.featureFlags].reduce((a, x) => a + featureFlagOptions[x].priority, 0);
|
|
const priorityThreshold = Math.floor(prioritySum / 2);
|
|
let selectedEngines: {
|
|
engine: Engine,
|
|
supportScore: number,
|
|
unsupportedFeatures: Set<FeatureFlag>,
|
|
}[] = [];
|
|
|
|
const currentEngines = meta.internalOptions.forceEngine !== undefined ? [meta.internalOptions.forceEngine] : engines;
|
|
|
|
for (const engine of currentEngines) {
|
|
const supportedFlags = new Set([...Object.entries(engineOptions[engine].features).filter(([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true).map(([k, _]) => k)]);
|
|
const supportScore = [...supportedFlags].reduce((a, x) => a + featureFlagOptions[x].priority, 0);
|
|
|
|
const unsupportedFeatures = new Set([...meta.featureFlags]);
|
|
for (const flag of meta.featureFlags) {
|
|
if (supportedFlags.has(flag)) {
|
|
unsupportedFeatures.delete(flag);
|
|
}
|
|
}
|
|
|
|
if (supportScore >= priorityThreshold) {
|
|
selectedEngines.push({ engine, supportScore, unsupportedFeatures });
|
|
meta.logger.debug(`Engine ${engine} meets feature priority threshold`, { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures });
|
|
} else {
|
|
meta.logger.debug(`Engine ${engine} does not meet feature priority threshold`, { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures});
|
|
}
|
|
}
|
|
|
|
if (selectedEngines.some(x => engineOptions[x.engine].quality > 0)) {
|
|
selectedEngines = selectedEngines.filter(x => engineOptions[x.engine].quality > 0);
|
|
}
|
|
|
|
selectedEngines.sort((a,b) => b.supportScore - a.supportScore || engineOptions[b.engine].quality - engineOptions[a.engine].quality);
|
|
|
|
return selectedEngines;
|
|
}
|
|
|
|
export async function scrapeURLWithEngine(meta: Meta, engine: Engine): Promise<EngineScrapeResult> {
|
|
const fn = engineHandlers[engine];
|
|
const logger = meta.logger.child({ method: fn.name ?? "scrapeURLWithEngine", engine });
|
|
const _meta = {
|
|
...meta,
|
|
logger,
|
|
};
|
|
|
|
return await fn(_meta);
|
|
}
|