mirror of https://github.com/mendableai/firecrawl.git
synced 2025-09-25 16:29:43 +00:00

Merge branch 'main' of https://github.com/mendableai/firecrawl
This commit is contained in: commit e26a0a65a7

apps/api/pnpm-lock.yaml (generated): 14 lines changed
@@ -7478,7 +7478,7 @@ snapshots:

   extract-zip@2.0.1:
     dependencies:
-      debug: 4.3.4
+      debug: 4.3.5
       get-stream: 5.2.0
       yauzl: 2.10.0
     optionalDependencies:

@@ -7622,7 +7622,7 @@ snapshots:
     dependencies:
       basic-ftp: 5.0.5
       data-uri-to-buffer: 6.0.2
-      debug: 4.3.4
+      debug: 4.3.5
       fs-extra: 11.2.0
     transitivePeerDependencies:
       - supports-color

@@ -7723,7 +7723,7 @@ snapshots:

   http-proxy-agent@7.0.2:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
     transitivePeerDependencies:
       - supports-color

@@ -7771,7 +7771,7 @@ snapshots:

   https-proxy-agent@7.0.5:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
     transitivePeerDependencies:
       - supports-color

@@ -8836,7 +8836,7 @@ snapshots:
     dependencies:
       '@tootallnate/quickjs-emscripten': 0.23.0
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
       get-uri: 6.0.3
       http-proxy-agent: 7.0.2
       https-proxy-agent: 7.0.5

@@ -9031,7 +9031,7 @@ snapshots:

   proxy-agent@6.4.0:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
       http-proxy-agent: 7.0.2
       https-proxy-agent: 7.0.5
       lru-cache: 7.18.3

@@ -9338,7 +9338,7 @@ snapshots:

   socks-proxy-agent@8.0.4:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
       socks: 2.8.3
     transitivePeerDependencies:
       - supports-color
@@ -60,7 +60,7 @@ export async function scrapeController(
   try {
     doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
   } catch (e) {
-    logger.error(`Error in scrapeController: ${e}`);
+    logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime });
     if (
       e instanceof Error &&
       (e.message.startsWith("Job wait") || e.message === "timeout")
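For context, a minimal sketch of the contract the catch block above relies on, assuming a polling-based job queue; waitForJobSketch, getJob, the QueueJob shape, and the poll interval are illustrative stand-ins, not Firecrawl's actual implementation:

// Sketch only: resolve with the job's result, or reject with an error whose
// message starts with "Job wait" on timeout, which the catch block matches on.
type QueueJob = {
  state: "completed" | "failed" | "active";
  returnvalue?: unknown;
  failedReason?: string;
};
declare function getJob(jobId: string): Promise<QueueJob | null>; // hypothetical queue lookup

async function waitForJobSketch<T>(jobId: string, timeout: number): Promise<T> {
  const start = Date.now();
  while (Date.now() - start < timeout) {
    const job = await getJob(jobId);
    if (job?.state === "completed") return job.returnvalue as T;
    if (job?.state === "failed") throw new Error(job.failedReason);
    await new Promise((r) => setTimeout(r, 250)); // poll interval
  }
  throw new Error(`Job wait timed out for job ${jobId}`);
}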
@@ -96,7 +96,6 @@ export async function runWebScraper({
       ...internalOptions,
     });
     if (!response.success) {
-      error = response.error;
       if (response.error instanceof Error) {
         throw response.error;
       } else {

@@ -124,7 +123,8 @@ export async function runWebScraper({
         // status code is good -- do not attempt retry
         break;
       }
-    } catch (error) {
+    } catch (_error) {
+      error = _error;
       engines =
         response !== undefined
           ? response.engines
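The rename fixes a classic shadowing bug: `catch (error)` declares a new binding that hides the outer `error`, so nothing assigned inside the catch could ever reach the outer variable. A self-contained illustration:

// Minimal repro of the bug the rename fixes.
let error: unknown = undefined;

try {
  throw new Error("scrape failed");
} catch (_error) {
  // With `catch (error)` this assignment would target the catch binding,
  // leaving the outer `error` forever undefined.
  error = _error;
}

console.log(error); // Error: scrape failed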
@@ -123,8 +123,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
     // Include specified actions
     ...(meta.options.actions ?? []),
   ];

+  const totalWait = actions.reduce(
+    (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
+    0,
+  );
+
-  const timeout = timeToRun ?? 300000;
+  const timeout = (timeToRun ?? 300000) + totalWait;

   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestChromeCDP = {

@@ -146,18 +151,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
     // TODO: scrollXPaths
   };

-  const totalWait = actions.reduce(
-    (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
-    0,
-  );
-
   let response = await performFireEngineScrape(
     meta.logger.child({
       method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
       request,
     }),
     request,
-    timeout + totalWait,
+    timeout,
   );

   specialtyScrapeCheck(
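Worked through with hypothetical actions: the fold sums every wait action (defaulting a missing milliseconds to 1000), so the wait time widens the engine timeout once, up front, instead of being re-added at the call site. The Playwright variant below applies the same idea using meta.options.waitFor. The Action type and sample values here are illustrative, not the repo's definitions:

type Action = { type: "wait"; milliseconds?: number } | { type: "click"; selector: string };

const actions: Action[] = [
  { type: "wait", milliseconds: 2000 },
  { type: "click", selector: "#load-more" },
  { type: "wait" }, // no milliseconds -> counts as the 1000ms default
];

const totalWait = actions.reduce(
  (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
  0,
); // 3000

const timeToRun: number | undefined = 15_000;
const timeout = (timeToRun ?? 300_000) + totalWait; // 18000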
@@ -213,7 +213,8 @@ export async function scrapeURLWithFireEnginePlaywright(
   meta: Meta,
   timeToRun: number | undefined,
 ): Promise<EngineScrapeResult> {
-  const timeout = timeToRun ?? 300000;
+  const totalWait = meta.options.waitFor;
+  const timeout = (timeToRun ?? 300000) + totalWait;

   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestPlaywright = {

@@ -237,7 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright(
       request,
     }),
     request,
-    timeout + meta.options.waitFor,
+    timeout,
   );

   specialtyScrapeCheck(
@@ -140,9 +140,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
   const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

   let result: PDFProcessorResult | null = null;
-  if (process.env.LLAMAPARSE_API_KEY) {
+
+  // First, try parsing with PdfParse
+  result = await scrapePDFWithParsePDF(
+    {
+      ...meta,
+      logger: meta.logger.child({
+        method: "scrapePDF/scrapePDFWithParsePDF",
+      }),
+    },
+    tempFilePath,
+  );
+
+
+  // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
+  if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
     try {
-      result = await scrapePDFWithLlamaParse(
+      const llamaResult = await scrapePDFWithLlamaParse(
         {
           ...meta,
           logger: meta.logger.child({

@@ -152,16 +166,17 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
         tempFilePath,
         timeToRun,
       );
+      result = llamaResult; // Use LlamaParse result if successful
     } catch (error) {
       if (error instanceof Error && error.message === "LlamaParse timed out") {
-        meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
+        meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
           error,
         });
       } else if (error instanceof RemoveFeatureError) {
         throw error;
       } else {
         meta.logger.warn(
-          "LlamaParse failed to parse PDF -- falling back to parse-pdf",
+          "LlamaParse failed to parse PDF -- using parse-pdf result",
           { error },
         );
         Sentry.captureException(error);

@@ -169,18 +184,6 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
       }
     }

-  if (result === null) {
-    result = await scrapePDFWithParsePDF(
-      {
-        ...meta,
-        logger: meta.logger.child({
-          method: "scrapePDF/scrapePDFWithParsePDF",
-        }),
-      },
-      tempFilePath,
-    );
-  }
-
   await fs.unlink(tempFilePath);

   return {

@@ -190,4 +193,4 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
     html: result.html,
     markdown: result.markdown,
   };
 }
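A simplified sketch of the reordered flow: parse-pdf now always runs first, and LlamaParse is only an upgrade path taken when the local output looks too thin. The signatures below are reduced for illustration (the real functions also take meta with a child logger and timeToRun, rethrow RemoveFeatureError, and report other failures to Sentry):

type PDFProcessorResult = { html: string; markdown?: string };
declare function scrapePDFWithParsePDF(tempFilePath: string): Promise<PDFProcessorResult>;
declare function scrapePDFWithLlamaParse(tempFilePath: string): Promise<PDFProcessorResult>;

async function parsePdfPreferringLocal(tempFilePath: string): Promise<PDFProcessorResult> {
  // parse-pdf always runs first, so there is always a usable result
  let result = await scrapePDFWithParsePDF(tempFilePath);

  // LlamaParse only runs for thin local output plus a configured API key,
  // and only replaces the local result when it succeeds
  if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
    try {
      result = await scrapePDFWithLlamaParse(tempFilePath);
    } catch {
      // on timeout or failure, keep the parse-pdf result
    }
  }
  return result;
}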
@@ -204,7 +204,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
   let result: EngineScrapeResultWithContext | null = null;

   const timeToRun = meta.options.timeout !== undefined
-    ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3))
+    ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
     : undefined

   for (const { engine, unsupportedFeatures } of fallbackList) {
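Worked through with assumed numbers: the overall timeout is split across at most N engines, so lowering the cap from 3 to 2 gives each engine a larger slice of the budget.

const timeoutMs = 60_000;   // hypothetical request timeout
const fallbackEngines = 4;  // hypothetical fallback list length

const before = Math.round(timeoutMs / Math.min(fallbackEngines, 3)); // 20000
const after = Math.round(timeoutMs / Math.min(fallbackEngines, 2));  // 30000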
@@ -3,6 +3,7 @@
     "rootDir": "./src",
     "lib": ["ES2022", "DOM"],


     // or higher
+    "target": "ES2022",

@@ -18,7 +19,7 @@
       "*": ["node_modules/*", "src/types/*"],
     },

-    "inlineSources": true
+    "inlineSources": true,
   },
   "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
 }