diff --git a/apps/api/src/__tests__/snips/lib.ts b/apps/api/src/__tests__/snips/lib.ts index f87be6d4..8b32c6bc 100644 --- a/apps/api/src/__tests__/snips/lib.ts +++ b/apps/api/src/__tests__/snips/lib.ts @@ -31,6 +31,11 @@ function expectScrapeToSucceed(response: Awaited>) export async function scrape(body: ScrapeRequestInput): Promise { const raw = await scrapeRaw(body); expectScrapeToSucceed(raw); + if (body.proxy === "stealth") { + expect(raw.body.data.metadata.proxyUsed).toBe("stealth"); + } else if (!body.proxy || body.proxy === "basic") { + expect(raw.body.data.metadata.proxyUsed).toBe("basic"); + } return raw.body.data; } diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index ead78ac0..2bcad148 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -275,6 +275,26 @@ describe("Scrape tests", () => { timeout: 120000, }); }, 130000); + + it.concurrent("auto works properly on non-stealth site", async () => { + const res = await scrape({ + url: "http://firecrawl.dev", + proxy: "auto", + timeout: 120000, + }); + + expect(res.metadata.proxyUsed).toBe("basic"); + }, 130000); + + it.concurrent("auto works properly on 'stealth' site (faked for reliable testing)", async () => { + const res = await scrape({ + url: "https://httpstat.us/403", + proxy: "auto", + timeout: 120000, + }); + + expect(res.metadata.proxyUsed).toBe("stealth"); + }, 130000); }); // Temporarily disabled, too flaky diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index d492156f..ea43ead6 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -140,6 +140,7 @@ export async function scrapeController( if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) { creditsToBeBilled = 5; } + if 
(req.body.agent?.model?.toLowerCase() === "fire-1" || req.body.extract?.agent?.model?.toLowerCase() === "fire-1" || req.body.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") { if (process.env.USE_DB_AUTHENTICATION === "true") { // @Nick this is a hack pushed at 2AM pls help - mogery @@ -155,7 +156,7 @@ export async function scrapeController( } } - if (req.body.proxy === "stealth") { + if (doc?.metadata?.proxyUsed === "stealth") { creditsToBeBilled += 4; } diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index 5e90f272..7cb2f573 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -145,6 +145,7 @@ async function scrapeSearchResult( metadata: { statusCode, error: error.message, + proxyUsed: "basic", }, }; } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 1a39037a..02911125 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -308,7 +308,7 @@ const baseScrapeOptions = z fastMode: z.boolean().default(false), useMock: z.string().optional(), blockAds: z.boolean().default(true), - proxy: z.enum(["basic", "stealth"]).optional(), + proxy: z.enum(["basic", "stealth", "auto"]).optional(), }) .strict(strictMessage); @@ -360,7 +360,7 @@ const extractTransform = (obj) => { obj = { ...obj, timeout: 300000 }; } - if (obj.proxy === "stealth" && obj.timeout === 30000) { + if ((obj.proxy === "stealth" || obj.proxy === "auto") && obj.timeout === 30000) { obj = { ...obj, timeout: 120000 }; } @@ -748,6 +748,7 @@ export type Document = { statusCode: number; scrapeId?: string; error?: string; + proxyUsed: "basic" | "stealth"; // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined; }; serpResults?: { diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 
37d3c870..1463104f 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -223,7 +223,7 @@ export async function scrapeURLWithFireEngineChromeCDP( timeout, // TODO: better timeout logic disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, blockAds: meta.options.blockAds, - mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? true : false, + mobileProxy: meta.featureFlags.has("stealthProxy"), saveScrapeResultToGCS: meta.internalOptions.saveScrapeResultToGCS, // TODO: scrollXPaths }; @@ -304,7 +304,7 @@ export async function scrapeURLWithFireEnginePlaywright( wait: meta.options.waitFor, geolocation: meta.options.geolocation ?? meta.options.location, blockAds: meta.options.blockAds, - mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? true : false, + mobileProxy: meta.featureFlags.has("stealthProxy"), timeout, }; @@ -360,7 +360,7 @@ export async function scrapeURLWithFireEngineTLSClient( atsv: meta.internalOptions.atsv, geolocation: meta.options.geolocation ?? meta.options.location, disableJsDom: meta.internalOptions.v0DisableJsDom, - mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? 
true : false, + mobileProxy: meta.featureFlags.has("stealthProxy"), timeout, }; diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 6a2a2e40..08fb4a3b 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -14,8 +14,12 @@ import { scrapeCache } from "./cache"; export type Engine = | "fire-engine;chrome-cdp" | "fire-engine(retry);chrome-cdp" + | "fire-engine;chrome-cdp;stealth" + | "fire-engine(retry);chrome-cdp;stealth" | "fire-engine;playwright" + | "fire-engine;playwright;stealth" | "fire-engine;tlsclient" + | "fire-engine;tlsclient;stealth" | "playwright" | "fetch" | "pdf" @@ -37,9 +41,13 @@ export const engines: Engine[] = [ ...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, + "fire-engine;chrome-cdp;stealth" as const, "fire-engine(retry);chrome-cdp" as const, + "fire-engine(retry);chrome-cdp;stealth" as const, "fire-engine;playwright" as const, + "fire-engine;playwright;stealth" as const, "fire-engine;tlsclient" as const, + "fire-engine;tlsclient;stealth" as const, ] : []), ...(usePlaywright ? ["playwright" as const] : []), @@ -112,8 +120,12 @@ const engineHandlers: { cache: scrapeCache, "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, "fire-engine(retry);chrome-cdp": scrapeURLWithFireEngineChromeCDP, + "fire-engine;chrome-cdp;stealth": scrapeURLWithFireEngineChromeCDP, + "fire-engine(retry);chrome-cdp;stealth": scrapeURLWithFireEngineChromeCDP, "fire-engine;playwright": scrapeURLWithFireEnginePlaywright, + "fire-engine;playwright;stealth": scrapeURLWithFireEnginePlaywright, "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient, + "fire-engine;tlsclient;stealth": scrapeURLWithFireEngineTLSClient, playwright: scrapeURLWithPlaywright, fetch: scrapeURLWithFetch, pdf: scrapePDF, @@ -126,7 +138,7 @@ export const engineOptions: { features: { [F in FeatureFlag]: boolean }; // This defines the order of engines in general. 
The engine with the highest quality will be used the most. - // Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX + // Negative quality numbers are reserved for specialty engines, e.g. PDF, DOCX, stealth proxies quality: number; }; } = { @@ -160,7 +172,7 @@ export const engineOptions: { mobile: true, skipTlsVerification: true, useFastMode: false, - stealthProxy: true, + stealthProxy: false, }, quality: 50, }, @@ -177,10 +189,44 @@ export const engineOptions: { mobile: true, skipTlsVerification: true, useFastMode: false, - stealthProxy: true, + stealthProxy: false, }, quality: 45, }, + "fire-engine;chrome-cdp;stealth": { + features: { + actions: true, + waitFor: true, // through actions transform + screenshot: true, // through actions transform + "screenshot@fullScreen": true, // through actions transform + pdf: false, + docx: false, + atsv: false, + location: true, + mobile: true, + skipTlsVerification: true, + useFastMode: false, + stealthProxy: true, + }, + quality: -1, + }, + "fire-engine(retry);chrome-cdp;stealth": { + features: { + actions: true, + waitFor: true, // through actions transform + screenshot: true, // through actions transform + "screenshot@fullScreen": true, // through actions transform + pdf: false, + docx: false, + atsv: false, + location: true, + mobile: true, + skipTlsVerification: true, + useFastMode: false, + stealthProxy: true, + }, + quality: -5, + }, "fire-engine;playwright": { features: { actions: false, @@ -194,10 +240,27 @@ export const engineOptions: { mobile: false, skipTlsVerification: false, useFastMode: false, - stealthProxy: true, + stealthProxy: false, }, quality: 40, }, + "fire-engine;playwright;stealth": { + features: { + actions: false, + waitFor: true, + screenshot: true, + "screenshot@fullScreen": true, + pdf: false, + docx: false, + atsv: false, + location: false, + mobile: false, + skipTlsVerification: false, + useFastMode: false, + stealthProxy: true, + }, + quality: -10, + }, playwright: { 
features: { actions: false, @@ -228,10 +291,27 @@ export const engineOptions: { mobile: false, skipTlsVerification: false, useFastMode: true, - stealthProxy: true, + stealthProxy: false, }, quality: 10, }, + "fire-engine;tlsclient;stealth": { + features: { + actions: false, + waitFor: false, + screenshot: false, + "screenshot@fullScreen": false, + pdf: false, + docx: false, + atsv: true, + location: true, + mobile: false, + skipTlsVerification: false, + useFastMode: true, + stealthProxy: true, + }, + quality: -15, + }, fetch: { features: { actions: false, @@ -264,7 +344,7 @@ export const engineOptions: { useFastMode: true, stealthProxy: true, // kinda... }, - quality: -10, + quality: -20, }, docx: { features: { @@ -281,7 +361,7 @@ export const engineOptions: { useFastMode: true, stealthProxy: true, // kinda... }, - quality: -10, + quality: -20, }, }; @@ -293,7 +373,7 @@ export function buildFallbackList(meta: Meta): { ...engines, // enable fire-engine in self-hosted testing environment when mocks are supplied - ...((!useFireEngine && meta.mock !== null) ? ["fire-engine;chrome-cdp", "fire-engine(retry);chrome-cdp", "fire-engine;playwright", "fire-engine;tlsclient"] as Engine[] : []) + ...((!useFireEngine && meta.mock !== null) ? 
["fire-engine;chrome-cdp", "fire-engine(retry);chrome-cdp", "fire-engine;chrome-cdp;stealth", "fire-engine(retry);chrome-cdp;stealth", "fire-engine;playwright", "fire-engine;tlsclient", "fire-engine;playwright;stealth", "fire-engine;tlsclient;stealth"] as Engine[] : []) ]; if (meta.internalOptions.useCache !== true) { diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index baf29752..d5d5a680 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -261,16 +261,22 @@ async function scrapeURLLoop(meta: Meta): Promise { (engineResult.statusCode >= 200 && engineResult.statusCode < 300) || engineResult.statusCode === 304; const hasNoPageError = engineResult.error === undefined; + const isLikelyProxyError = [403, 429].includes(engineResult.statusCode); results[engine] = { state: "success", result: engineResult, - factors: { isLongEnough, isGoodStatusCode, hasNoPageError }, + factors: { isLongEnough, isGoodStatusCode, hasNoPageError, isLikelyProxyError }, unsupportedFeatures, startedAt, finishedAt: Date.now(), }; + if (isLikelyProxyError && meta.options.proxy === "auto" && !meta.featureFlags.has("stealthProxy")) { + meta.logger.info("Scrape via " + engine + " deemed unsuccessful due to proxy inadequacy. Adding stealthProxy flag."); + throw new AddFeatureError(["stealthProxy"]); + } + // NOTE: TODO: what to do when status code is bad is tough... // we cannot just rely on text because error messages can be brief and not hit the limit // should we just use all the fallbacks and pick the one with the longest text? - mogery @@ -368,6 +374,7 @@ async function scrapeURLLoop(meta: Meta): Promise { url: result.result.url, statusCode: result.result.statusCode, error: result.result.error, + proxyUsed: meta.featureFlags.has("stealthProxy") ? 
"stealth" : "basic", }, }; diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 72efb5d8..d9da9ea1 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -1384,7 +1384,7 @@ async function processJob(job: Job & { id: string }, token: string) { } } - if (job.data.scrapeOptions.proxy === "stealth") { + if (doc.metadata?.proxyUsed === "stealth") { creditsToBeBilled += 4; }