mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-09-25 16:29:43 +00:00
feat(scrapeURL): proxy auto mode (FIR-1853) (#1551)
* feat(scrapeURL): proxy auto mode
* feat(api/tests/snips/proxy/auto): add test for stealth pick
This commit is contained in:
parent
8eeb3c5cd4
commit
fab4f00536
@ -31,6 +31,11 @@ function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>)
|
|||||||
export async function scrape(body: ScrapeRequestInput): Promise<Document> {
|
export async function scrape(body: ScrapeRequestInput): Promise<Document> {
|
||||||
const raw = await scrapeRaw(body);
|
const raw = await scrapeRaw(body);
|
||||||
expectScrapeToSucceed(raw);
|
expectScrapeToSucceed(raw);
|
||||||
|
if (body.proxy === "stealth") {
|
||||||
|
expect(raw.body.data.metadata.proxyUsed).toBe("stealth");
|
||||||
|
} else if (!body.proxy || body.proxy === "basic") {
|
||||||
|
expect(raw.body.data.metadata.proxyUsed).toBe("basic");
|
||||||
|
}
|
||||||
return raw.body.data;
|
return raw.body.data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -275,6 +275,26 @@ describe("Scrape tests", () => {
|
|||||||
timeout: 120000,
|
timeout: 120000,
|
||||||
});
|
});
|
||||||
}, 130000);
|
}, 130000);
|
||||||
|
|
||||||
|
it.concurrent("auto works properly on non-stealth site", async () => {
|
||||||
|
const res = await scrape({
|
||||||
|
url: "http://firecrawl.dev",
|
||||||
|
proxy: "auto",
|
||||||
|
timeout: 120000,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(res.metadata.proxyUsed).toBe("basic");
|
||||||
|
}, 130000);
|
||||||
|
|
||||||
|
it.concurrent("auto works properly on 'stealth' site (faked for reliabile testing)", async () => {
|
||||||
|
const res = await scrape({
|
||||||
|
url: "https://httpstat.us/403",
|
||||||
|
proxy: "auto",
|
||||||
|
timeout: 120000,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(res.metadata.proxyUsed).toBe("stealth");
|
||||||
|
}, 130000);
|
||||||
});
|
});
|
||||||
|
|
||||||
// Temporarily disabled, too flaky
|
// Temporarily disabled, too flaky
|
||||||
|
@ -140,6 +140,7 @@ export async function scrapeController(
|
|||||||
if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) {
|
if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) {
|
||||||
creditsToBeBilled = 5;
|
creditsToBeBilled = 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (req.body.agent?.model?.toLowerCase() === "fire-1" || req.body.extract?.agent?.model?.toLowerCase() === "fire-1" || req.body.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
|
if (req.body.agent?.model?.toLowerCase() === "fire-1" || req.body.extract?.agent?.model?.toLowerCase() === "fire-1" || req.body.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
|
||||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||||
// @Nick this is a hack pushed at 2AM pls help - mogery
|
// @Nick this is a hack pushed at 2AM pls help - mogery
|
||||||
@ -155,7 +156,7 @@ export async function scrapeController(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (req.body.proxy === "stealth") {
|
if (doc?.metadata?.proxyUsed === "stealth") {
|
||||||
creditsToBeBilled += 4;
|
creditsToBeBilled += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -145,6 +145,7 @@ async function scrapeSearchResult(
|
|||||||
metadata: {
|
metadata: {
|
||||||
statusCode,
|
statusCode,
|
||||||
error: error.message,
|
error: error.message,
|
||||||
|
proxyUsed: "basic",
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -308,7 +308,7 @@ const baseScrapeOptions = z
|
|||||||
fastMode: z.boolean().default(false),
|
fastMode: z.boolean().default(false),
|
||||||
useMock: z.string().optional(),
|
useMock: z.string().optional(),
|
||||||
blockAds: z.boolean().default(true),
|
blockAds: z.boolean().default(true),
|
||||||
proxy: z.enum(["basic", "stealth"]).optional(),
|
proxy: z.enum(["basic", "stealth", "auto"]).optional(),
|
||||||
})
|
})
|
||||||
.strict(strictMessage);
|
.strict(strictMessage);
|
||||||
|
|
||||||
@ -360,7 +360,7 @@ const extractTransform = (obj) => {
|
|||||||
obj = { ...obj, timeout: 300000 };
|
obj = { ...obj, timeout: 300000 };
|
||||||
}
|
}
|
||||||
|
|
||||||
if (obj.proxy === "stealth" && obj.timeout === 30000) {
|
if ((obj.proxy === "stealth" || obj.proxy === "auto") && obj.timeout === 30000) {
|
||||||
obj = { ...obj, timeout: 120000 };
|
obj = { ...obj, timeout: 120000 };
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -748,6 +748,7 @@ export type Document = {
|
|||||||
statusCode: number;
|
statusCode: number;
|
||||||
scrapeId?: string;
|
scrapeId?: string;
|
||||||
error?: string;
|
error?: string;
|
||||||
|
proxyUsed: "basic" | "stealth";
|
||||||
// [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
|
// [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
|
||||||
};
|
};
|
||||||
serpResults?: {
|
serpResults?: {
|
||||||
|
@ -223,7 +223,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
|||||||
timeout, // TODO: better timeout logic
|
timeout, // TODO: better timeout logic
|
||||||
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
|
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
|
||||||
blockAds: meta.options.blockAds,
|
blockAds: meta.options.blockAds,
|
||||||
mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? true : false,
|
mobileProxy: meta.featureFlags.has("stealthProxy"),
|
||||||
saveScrapeResultToGCS: meta.internalOptions.saveScrapeResultToGCS,
|
saveScrapeResultToGCS: meta.internalOptions.saveScrapeResultToGCS,
|
||||||
// TODO: scrollXPaths
|
// TODO: scrollXPaths
|
||||||
};
|
};
|
||||||
@ -304,7 +304,7 @@ export async function scrapeURLWithFireEnginePlaywright(
|
|||||||
wait: meta.options.waitFor,
|
wait: meta.options.waitFor,
|
||||||
geolocation: meta.options.geolocation ?? meta.options.location,
|
geolocation: meta.options.geolocation ?? meta.options.location,
|
||||||
blockAds: meta.options.blockAds,
|
blockAds: meta.options.blockAds,
|
||||||
mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? true : false,
|
mobileProxy: meta.featureFlags.has("stealthProxy"),
|
||||||
|
|
||||||
timeout,
|
timeout,
|
||||||
};
|
};
|
||||||
@ -360,7 +360,7 @@ export async function scrapeURLWithFireEngineTLSClient(
|
|||||||
atsv: meta.internalOptions.atsv,
|
atsv: meta.internalOptions.atsv,
|
||||||
geolocation: meta.options.geolocation ?? meta.options.location,
|
geolocation: meta.options.geolocation ?? meta.options.location,
|
||||||
disableJsDom: meta.internalOptions.v0DisableJsDom,
|
disableJsDom: meta.internalOptions.v0DisableJsDom,
|
||||||
mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? true : false,
|
mobileProxy: meta.featureFlags.has("stealthProxy"),
|
||||||
|
|
||||||
timeout,
|
timeout,
|
||||||
};
|
};
|
||||||
|
@ -14,8 +14,12 @@ import { scrapeCache } from "./cache";
|
|||||||
export type Engine =
|
export type Engine =
|
||||||
| "fire-engine;chrome-cdp"
|
| "fire-engine;chrome-cdp"
|
||||||
| "fire-engine(retry);chrome-cdp"
|
| "fire-engine(retry);chrome-cdp"
|
||||||
|
| "fire-engine;chrome-cdp;stealth"
|
||||||
|
| "fire-engine(retry);chrome-cdp;stealth"
|
||||||
| "fire-engine;playwright"
|
| "fire-engine;playwright"
|
||||||
|
| "fire-engine;playwright;stealth"
|
||||||
| "fire-engine;tlsclient"
|
| "fire-engine;tlsclient"
|
||||||
|
| "fire-engine;tlsclient;stealth"
|
||||||
| "playwright"
|
| "playwright"
|
||||||
| "fetch"
|
| "fetch"
|
||||||
| "pdf"
|
| "pdf"
|
||||||
@ -37,9 +41,13 @@ export const engines: Engine[] = [
|
|||||||
...(useFireEngine
|
...(useFireEngine
|
||||||
? [
|
? [
|
||||||
"fire-engine;chrome-cdp" as const,
|
"fire-engine;chrome-cdp" as const,
|
||||||
|
"fire-engine;chrome-cdp;stealth" as const,
|
||||||
"fire-engine(retry);chrome-cdp" as const,
|
"fire-engine(retry);chrome-cdp" as const,
|
||||||
|
"fire-engine(retry);chrome-cdp;stealth" as const,
|
||||||
"fire-engine;playwright" as const,
|
"fire-engine;playwright" as const,
|
||||||
|
"fire-engine;playwright;stealth" as const,
|
||||||
"fire-engine;tlsclient" as const,
|
"fire-engine;tlsclient" as const,
|
||||||
|
"fire-engine;tlsclient;stealth" as const,
|
||||||
]
|
]
|
||||||
: []),
|
: []),
|
||||||
...(usePlaywright ? ["playwright" as const] : []),
|
...(usePlaywright ? ["playwright" as const] : []),
|
||||||
@ -112,8 +120,12 @@ const engineHandlers: {
|
|||||||
cache: scrapeCache,
|
cache: scrapeCache,
|
||||||
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
||||||
"fire-engine(retry);chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
"fire-engine(retry);chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
||||||
|
"fire-engine;chrome-cdp;stealth": scrapeURLWithFireEngineChromeCDP,
|
||||||
|
"fire-engine(retry);chrome-cdp;stealth": scrapeURLWithFireEngineChromeCDP,
|
||||||
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
|
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
|
||||||
|
"fire-engine;playwright;stealth": scrapeURLWithFireEnginePlaywright,
|
||||||
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
|
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
|
||||||
|
"fire-engine;tlsclient;stealth": scrapeURLWithFireEngineTLSClient,
|
||||||
playwright: scrapeURLWithPlaywright,
|
playwright: scrapeURLWithPlaywright,
|
||||||
fetch: scrapeURLWithFetch,
|
fetch: scrapeURLWithFetch,
|
||||||
pdf: scrapePDF,
|
pdf: scrapePDF,
|
||||||
@ -126,7 +138,7 @@ export const engineOptions: {
|
|||||||
features: { [F in FeatureFlag]: boolean };
|
features: { [F in FeatureFlag]: boolean };
|
||||||
|
|
||||||
// This defines the order of engines in general. The engine with the highest quality will be used the most.
|
// This defines the order of engines in general. The engine with the highest quality will be used the most.
|
||||||
// Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX
|
// Negative quality numbers are reserved for specialty engines, e.g. PDF, DOCX, stealth proxies
|
||||||
quality: number;
|
quality: number;
|
||||||
};
|
};
|
||||||
} = {
|
} = {
|
||||||
@ -160,7 +172,7 @@ export const engineOptions: {
|
|||||||
mobile: true,
|
mobile: true,
|
||||||
skipTlsVerification: true,
|
skipTlsVerification: true,
|
||||||
useFastMode: false,
|
useFastMode: false,
|
||||||
stealthProxy: true,
|
stealthProxy: false,
|
||||||
},
|
},
|
||||||
quality: 50,
|
quality: 50,
|
||||||
},
|
},
|
||||||
@ -177,10 +189,44 @@ export const engineOptions: {
|
|||||||
mobile: true,
|
mobile: true,
|
||||||
skipTlsVerification: true,
|
skipTlsVerification: true,
|
||||||
useFastMode: false,
|
useFastMode: false,
|
||||||
stealthProxy: true,
|
stealthProxy: false,
|
||||||
},
|
},
|
||||||
quality: 45,
|
quality: 45,
|
||||||
},
|
},
|
||||||
|
"fire-engine;chrome-cdp;stealth": {
|
||||||
|
features: {
|
||||||
|
actions: true,
|
||||||
|
waitFor: true, // through actions transform
|
||||||
|
screenshot: true, // through actions transform
|
||||||
|
"screenshot@fullScreen": true, // through actions transform
|
||||||
|
pdf: false,
|
||||||
|
docx: false,
|
||||||
|
atsv: false,
|
||||||
|
location: true,
|
||||||
|
mobile: true,
|
||||||
|
skipTlsVerification: true,
|
||||||
|
useFastMode: false,
|
||||||
|
stealthProxy: true,
|
||||||
|
},
|
||||||
|
quality: -1,
|
||||||
|
},
|
||||||
|
"fire-engine(retry);chrome-cdp;stealth": {
|
||||||
|
features: {
|
||||||
|
actions: true,
|
||||||
|
waitFor: true, // through actions transform
|
||||||
|
screenshot: true, // through actions transform
|
||||||
|
"screenshot@fullScreen": true, // through actions transform
|
||||||
|
pdf: false,
|
||||||
|
docx: false,
|
||||||
|
atsv: false,
|
||||||
|
location: true,
|
||||||
|
mobile: true,
|
||||||
|
skipTlsVerification: true,
|
||||||
|
useFastMode: false,
|
||||||
|
stealthProxy: true,
|
||||||
|
},
|
||||||
|
quality: -5,
|
||||||
|
},
|
||||||
"fire-engine;playwright": {
|
"fire-engine;playwright": {
|
||||||
features: {
|
features: {
|
||||||
actions: false,
|
actions: false,
|
||||||
@ -194,10 +240,27 @@ export const engineOptions: {
|
|||||||
mobile: false,
|
mobile: false,
|
||||||
skipTlsVerification: false,
|
skipTlsVerification: false,
|
||||||
useFastMode: false,
|
useFastMode: false,
|
||||||
stealthProxy: true,
|
stealthProxy: false,
|
||||||
},
|
},
|
||||||
quality: 40,
|
quality: 40,
|
||||||
},
|
},
|
||||||
|
"fire-engine;playwright;stealth": {
|
||||||
|
features: {
|
||||||
|
actions: false,
|
||||||
|
waitFor: true,
|
||||||
|
screenshot: true,
|
||||||
|
"screenshot@fullScreen": true,
|
||||||
|
pdf: false,
|
||||||
|
docx: false,
|
||||||
|
atsv: false,
|
||||||
|
location: false,
|
||||||
|
mobile: false,
|
||||||
|
skipTlsVerification: false,
|
||||||
|
useFastMode: false,
|
||||||
|
stealthProxy: true,
|
||||||
|
},
|
||||||
|
quality: -10,
|
||||||
|
},
|
||||||
playwright: {
|
playwright: {
|
||||||
features: {
|
features: {
|
||||||
actions: false,
|
actions: false,
|
||||||
@ -228,10 +291,27 @@ export const engineOptions: {
|
|||||||
mobile: false,
|
mobile: false,
|
||||||
skipTlsVerification: false,
|
skipTlsVerification: false,
|
||||||
useFastMode: true,
|
useFastMode: true,
|
||||||
stealthProxy: true,
|
stealthProxy: false,
|
||||||
},
|
},
|
||||||
quality: 10,
|
quality: 10,
|
||||||
},
|
},
|
||||||
|
"fire-engine;tlsclient;stealth": {
|
||||||
|
features: {
|
||||||
|
actions: false,
|
||||||
|
waitFor: false,
|
||||||
|
screenshot: false,
|
||||||
|
"screenshot@fullScreen": false,
|
||||||
|
pdf: false,
|
||||||
|
docx: false,
|
||||||
|
atsv: true,
|
||||||
|
location: true,
|
||||||
|
mobile: false,
|
||||||
|
skipTlsVerification: false,
|
||||||
|
useFastMode: true,
|
||||||
|
stealthProxy: true,
|
||||||
|
},
|
||||||
|
quality: -15,
|
||||||
|
},
|
||||||
fetch: {
|
fetch: {
|
||||||
features: {
|
features: {
|
||||||
actions: false,
|
actions: false,
|
||||||
@ -264,7 +344,7 @@ export const engineOptions: {
|
|||||||
useFastMode: true,
|
useFastMode: true,
|
||||||
stealthProxy: true, // kinda...
|
stealthProxy: true, // kinda...
|
||||||
},
|
},
|
||||||
quality: -10,
|
quality: -20,
|
||||||
},
|
},
|
||||||
docx: {
|
docx: {
|
||||||
features: {
|
features: {
|
||||||
@ -281,7 +361,7 @@ export const engineOptions: {
|
|||||||
useFastMode: true,
|
useFastMode: true,
|
||||||
stealthProxy: true, // kinda...
|
stealthProxy: true, // kinda...
|
||||||
},
|
},
|
||||||
quality: -10,
|
quality: -20,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -293,7 +373,7 @@ export function buildFallbackList(meta: Meta): {
|
|||||||
...engines,
|
...engines,
|
||||||
|
|
||||||
// enable fire-engine in self-hosted testing environment when mocks are supplied
|
// enable fire-engine in self-hosted testing environment when mocks are supplied
|
||||||
...((!useFireEngine && meta.mock !== null) ? ["fire-engine;chrome-cdp", "fire-engine(retry);chrome-cdp", "fire-engine;playwright", "fire-engine;tlsclient"] as Engine[] : [])
|
...((!useFireEngine && meta.mock !== null) ? ["fire-engine;chrome-cdp", "fire-engine(retry);chrome-cdp", "fire-engine;chrome-cdp;stealth", "fire-engine(retry);chrome-cdp;stealth", "fire-engine;playwright", "fire-engine;tlsclient", "fire-engine;playwright;stealth", "fire-engine;tlsclient;stealth"] as Engine[] : [])
|
||||||
];
|
];
|
||||||
|
|
||||||
if (meta.internalOptions.useCache !== true) {
|
if (meta.internalOptions.useCache !== true) {
|
||||||
|
@ -261,16 +261,22 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
|||||||
(engineResult.statusCode >= 200 && engineResult.statusCode < 300) ||
|
(engineResult.statusCode >= 200 && engineResult.statusCode < 300) ||
|
||||||
engineResult.statusCode === 304;
|
engineResult.statusCode === 304;
|
||||||
const hasNoPageError = engineResult.error === undefined;
|
const hasNoPageError = engineResult.error === undefined;
|
||||||
|
const isLikelyProxyError = [403, 429].includes(engineResult.statusCode);
|
||||||
|
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
state: "success",
|
state: "success",
|
||||||
result: engineResult,
|
result: engineResult,
|
||||||
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
|
factors: { isLongEnough, isGoodStatusCode, hasNoPageError, isLikelyProxyError },
|
||||||
unsupportedFeatures,
|
unsupportedFeatures,
|
||||||
startedAt,
|
startedAt,
|
||||||
finishedAt: Date.now(),
|
finishedAt: Date.now(),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (isLikelyProxyError && meta.options.proxy === "auto" && !meta.featureFlags.has("stealthProxy")) {
|
||||||
|
meta.logger.info("Scrape via " + engine + " deemed unsuccessful due to proxy inadequacy. Adding stealthProxy flag.");
|
||||||
|
throw new AddFeatureError(["stealthProxy"]);
|
||||||
|
}
|
||||||
|
|
||||||
// NOTE: TODO: what to do when status code is bad is tough...
|
// NOTE: TODO: what to do when status code is bad is tough...
|
||||||
// we cannot just rely on text because error messages can be brief and not hit the limit
|
// we cannot just rely on text because error messages can be brief and not hit the limit
|
||||||
// should we just use all the fallbacks and pick the one with the longest text? - mogery
|
// should we just use all the fallbacks and pick the one with the longest text? - mogery
|
||||||
@ -368,6 +374,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
|||||||
url: result.result.url,
|
url: result.result.url,
|
||||||
statusCode: result.result.statusCode,
|
statusCode: result.result.statusCode,
|
||||||
error: result.result.error,
|
error: result.result.error,
|
||||||
|
proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1384,7 +1384,7 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (job.data.scrapeOptions.proxy === "stealth") {
|
if (doc.metadata?.proxyUsed === "stealth") {
|
||||||
creditsToBeBilled += 4;
|
creditsToBeBilled += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user