mirror of https://github.com/mendableai/firecrawl.git
synced 2025-09-27 17:29:20 +00:00

Merge branch 'main' of https://github.com/mendableai/firecrawl

This commit is contained in: commit e26a0a65a7

apps/api/pnpm-lock.yaml (generated): 14 changes
@@ -7478,7 +7478,7 @@ snapshots:
   extract-zip@2.0.1:
     dependencies:
-      debug: 4.3.4
+      debug: 4.3.5
       get-stream: 5.2.0
       yauzl: 2.10.0
     optionalDependencies:
@@ -7622,7 +7622,7 @@ snapshots:
     dependencies:
       basic-ftp: 5.0.5
       data-uri-to-buffer: 6.0.2
-      debug: 4.3.4
+      debug: 4.3.5
       fs-extra: 11.2.0
     transitivePeerDependencies:
       - supports-color
@@ -7723,7 +7723,7 @@ snapshots:
   http-proxy-agent@7.0.2:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
     transitivePeerDependencies:
       - supports-color
@@ -7771,7 +7771,7 @@ snapshots:
   https-proxy-agent@7.0.5:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
     transitivePeerDependencies:
       - supports-color
@@ -8836,7 +8836,7 @@ snapshots:
     dependencies:
       '@tootallnate/quickjs-emscripten': 0.23.0
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
       get-uri: 6.0.3
       http-proxy-agent: 7.0.2
       https-proxy-agent: 7.0.5
@@ -9031,7 +9031,7 @@ snapshots:
   proxy-agent@6.4.0:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
       http-proxy-agent: 7.0.2
       https-proxy-agent: 7.0.5
       lru-cache: 7.18.3
@@ -9338,7 +9338,7 @@ snapshots:
   socks-proxy-agent@8.0.4:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
       socks: 2.8.3
     transitivePeerDependencies:
       - supports-color
@@ -60,7 +60,7 @@ export async function scrapeController(
   try {
     doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
   } catch (e) {
-    logger.error(`Error in scrapeController: ${e}`);
+    logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime });
     if (
       e instanceof Error &&
       (e.message.startsWith("Job wait") || e.message === "timeout")
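The only change in this hunk is the log call: the failure is now logged with structured metadata instead of a bare message string. A minimal sketch of the idea, assuming a Winston/pino-style logger whose second argument is a metadata object; the helper name logScrapeFailure is illustrative, not from the diff:

// Illustrative helper, not part of the diff: structured metadata makes
// failures searchable and correlatable by job in a log aggregator.
function logScrapeFailure(
  logger: { error: (msg: string, meta?: object) => void },
  e: unknown,
  jobId: string,
  startTime: number,
) {
  logger.error(`Error in scrapeController: ${e}`, {
    jobId,
    scrapeId: jobId, // in this controller the scrape id is the queue job id
    startTime,       // when the request began, useful for elapsed-time queries
  });
}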
@@ -96,7 +96,6 @@ export async function runWebScraper({
       ...internalOptions,
     });
     if (!response.success) {
-      error = response.error;
       if (response.error instanceof Error) {
         throw response.error;
       } else {
@@ -124,7 +123,8 @@ export async function runWebScraper({
        // status code is good -- do not attempt retry
        break;
      }
-    } catch (error) {
+    } catch (_error) {
+      error = _error;
      engines =
        response !== undefined
          ? response.engines
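The rename from catch (error) to catch (_error) is not cosmetic: runWebScraper keeps a mutable error variable in the enclosing scope, and a catch parameter named error would shadow it, so the caught value could never reach the retry logic below. A standalone sketch of the pitfall, with illustrative names:

let error: unknown = undefined; // outer binding, inspected after the loop

function mightThrow(): void {
  throw new Error("engine failed");
}

try {
  mightThrow();
} catch (error) {
  // The catch parameter shadows the outer `error`; nothing written here
  // can reach the outer binding under this name.
}
// outer `error` is still undefined at this point

try {
  mightThrow();
} catch (_error) {
  error = _error; // distinct name, so the outer binding is actually updated
}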
@@ -124,7 +124,12 @@ export async function scrapeURLWithFireEngineChromeCDP(
     ...(meta.options.actions ?? []),
   ];

-  const timeout = timeToRun ?? 300000;
+  const totalWait = actions.reduce(
+    (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
+    0,
+  );
+
+  const timeout = (timeToRun ?? 300000) + totalWait;

   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestChromeCDP = {
@@ -146,18 +151,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
     // TODO: scrollXPaths
   };

-  const totalWait = actions.reduce(
-    (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
-    0,
-  );
-
   let response = await performFireEngineScrape(
     meta.logger.child({
       method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
       request,
     }),
     request,
-    timeout + totalWait,
+    timeout,
   );

   specialtyScrapeCheck(
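Taken together, these two hunks move the wait-time accounting to a single place: totalWait is computed once, folded into timeout up front, and the call site passes plain timeout instead of timeout + totalWait. The value handed to performFireEngineScrape is unchanged; what changes is that timeout itself now includes the wait budget, presumably so any other use of it accounts for waits as well. A worked example of the arithmetic, with illustrative action values:

// Illustrative values, not from the diff:
type Action = { type: "wait"; milliseconds?: number } | { type: "click" };

const actions: Action[] = [
  { type: "wait", milliseconds: 2000 },
  { type: "click" },
  { type: "wait" }, // no milliseconds, so it defaults to 1000
];

// Same reduce as the diff: sum the wait actions, defaulting each to 1s.
const totalWait = actions.reduce(
  (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
  0,
); // 3000

const timeToRun: number | undefined = 30_000;
const timeout = (timeToRun ?? 300_000) + totalWait; // 33_000, passed once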
@@ -213,7 +213,8 @@ export async function scrapeURLWithFireEnginePlaywright(
   meta: Meta,
   timeToRun: number | undefined,
 ): Promise<EngineScrapeResult> {
-  const timeout = timeToRun ?? 300000;
+  const totalWait = meta.options.waitFor;
+  const timeout = (timeToRun ?? 300000) + totalWait;

   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestPlaywright = {
@@ -237,7 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright(
       request,
     }),
     request,
-    timeout + meta.options.waitFor,
+    timeout,
   );

   specialtyScrapeCheck(
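The Playwright engine gets the same treatment, with the single meta.options.waitFor value standing in for the summed wait actions. Worked numbers, illustrative only:

// Illustrative values, not from the diff:
const waitFor = 5_000;                            // meta.options.waitFor
const timeToRun: number | undefined = 30_000;

const timeout = (timeToRun ?? 300_000) + waitFor; // 35_000
// The call site now passes `timeout` instead of `timeout + waitFor`,
// so the final budget given to performFireEngineScrape is the same;
// only where the addition happens has moved.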
@@ -140,9 +140,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
   const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

   let result: PDFProcessorResult | null = null;
-  if (process.env.LLAMAPARSE_API_KEY) {
+
+  // First, try parsing with PdfParse
+  result = await scrapePDFWithParsePDF(
+    {
+      ...meta,
+      logger: meta.logger.child({
+        method: "scrapePDF/scrapePDFWithParsePDF",
+      }),
+    },
+    tempFilePath,
+  );
+
+
+  // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
+  if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
     try {
-      result = await scrapePDFWithLlamaParse(
+      const llamaResult = await scrapePDFWithLlamaParse(
        {
          ...meta,
          logger: meta.logger.child({
@@ -152,16 +166,17 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
         tempFilePath,
         timeToRun,
       );
+      result = llamaResult; // Use LlamaParse result if successful
     } catch (error) {
       if (error instanceof Error && error.message === "LlamaParse timed out") {
-        meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
+        meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
           error,
         });
       } else if (error instanceof RemoveFeatureError) {
         throw error;
       } else {
         meta.logger.warn(
-          "LlamaParse failed to parse PDF -- falling back to parse-pdf",
+          "LlamaParse failed to parse PDF -- using parse-pdf result",
           { error },
         );
         Sentry.captureException(error);
@@ -169,18 +184,6 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
     }
   }
-
-  if (result === null) {
-    result = await scrapePDFWithParsePDF(
-      {
-        ...meta,
-        logger: meta.logger.child({
-          method: "scrapePDF/scrapePDFWithParsePDF",
-        }),
-      },
-      tempFilePath,
-    );
-  }

   await fs.unlink(tempFilePath);

   return {
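These hunks invert the old control flow: previously LlamaParse ran first whenever LLAMAPARSE_API_KEY was set, with parse-pdf as the result === null fallback; now parse-pdf always runs first, and LlamaParse is an optional upgrade tried only when the local result looks too thin (under 500 characters of markdown). A condensed sketch of the new decision logic, with the real meta/tempFilePath/timeToRun arguments stubbed out:

// Condensed sketch; the real functions take meta and tempFilePath arguments.
declare function scrapePDFWithParsePDF(): Promise<{ markdown?: string }>;
declare function scrapePDFWithLlamaParse(): Promise<{ markdown?: string }>;

async function parsePdfPreferringCheapPath() {
  // 1. Always run the local parser first: fast and dependency-free.
  let result = await scrapePDFWithParsePDF();

  // 2. Only escalate to LlamaParse when the local result looks too thin
  //    and the feature is enabled via LLAMAPARSE_API_KEY.
  if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
    try {
      result = await scrapePDFWithLlamaParse();
    } catch {
      // On failure or timeout, keep the parse-pdf result instead of failing.
    }
  }
  return result;
}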
@@ -204,7 +204,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
   let result: EngineScrapeResultWithContext | null = null;

   const timeToRun = meta.options.timeout !== undefined
-    ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3))
+    ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
     : undefined

   for (const { engine, unsupportedFeatures } of fallbackList) {
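Capping the divisor at 2 instead of 3 gives each engine attempt a larger share of the caller's overall timeout. Worked numbers, using an illustrative 60s timeout and a 4-engine fallback list:

// Illustrative values: a 60s request timeout split across 4 fallback engines.
const timeout = 60_000;
const fallbackListLength = 4;

const before = Math.round(timeout / Math.min(fallbackListLength, 3)); // 20_000 ms per engine
const after  = Math.round(timeout / Math.min(fallbackListLength, 2)); // 30_000 ms per engine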
@@ -3,6 +3,7 @@
     "rootDir": "./src",
     "lib": ["ES2022", "DOM"],
+

     // or higher
     "target": "ES2022",
@@ -18,7 +19,7 @@
       "*": ["node_modules/*", "src/types/*"],
     },

-    "inlineSources": true
+    "inlineSources": true,
   },
   "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
 }