This commit is contained in:
Nicolas 2024-12-17 15:19:53 -03:00
commit e26a0a65a7
7 changed files with 43 additions and 38 deletions

View File

@ -7478,7 +7478,7 @@ snapshots:
extract-zip@2.0.1: extract-zip@2.0.1:
dependencies: dependencies:
debug: 4.3.4 debug: 4.3.5
get-stream: 5.2.0 get-stream: 5.2.0
yauzl: 2.10.0 yauzl: 2.10.0
optionalDependencies: optionalDependencies:
@ -7622,7 +7622,7 @@ snapshots:
dependencies: dependencies:
basic-ftp: 5.0.5 basic-ftp: 5.0.5
data-uri-to-buffer: 6.0.2 data-uri-to-buffer: 6.0.2
debug: 4.3.4 debug: 4.3.5
fs-extra: 11.2.0 fs-extra: 11.2.0
transitivePeerDependencies: transitivePeerDependencies:
- supports-color - supports-color
@ -7723,7 +7723,7 @@ snapshots:
http-proxy-agent@7.0.2: http-proxy-agent@7.0.2:
dependencies: dependencies:
agent-base: 7.1.1 agent-base: 7.1.1
debug: 4.3.4 debug: 4.3.5
transitivePeerDependencies: transitivePeerDependencies:
- supports-color - supports-color
@ -7771,7 +7771,7 @@ snapshots:
https-proxy-agent@7.0.5: https-proxy-agent@7.0.5:
dependencies: dependencies:
agent-base: 7.1.1 agent-base: 7.1.1
debug: 4.3.4 debug: 4.3.5
transitivePeerDependencies: transitivePeerDependencies:
- supports-color - supports-color
@ -8836,7 +8836,7 @@ snapshots:
dependencies: dependencies:
'@tootallnate/quickjs-emscripten': 0.23.0 '@tootallnate/quickjs-emscripten': 0.23.0
agent-base: 7.1.1 agent-base: 7.1.1
debug: 4.3.4 debug: 4.3.5
get-uri: 6.0.3 get-uri: 6.0.3
http-proxy-agent: 7.0.2 http-proxy-agent: 7.0.2
https-proxy-agent: 7.0.5 https-proxy-agent: 7.0.5
@ -9031,7 +9031,7 @@ snapshots:
proxy-agent@6.4.0: proxy-agent@6.4.0:
dependencies: dependencies:
agent-base: 7.1.1 agent-base: 7.1.1
debug: 4.3.4 debug: 4.3.5
http-proxy-agent: 7.0.2 http-proxy-agent: 7.0.2
https-proxy-agent: 7.0.5 https-proxy-agent: 7.0.5
lru-cache: 7.18.3 lru-cache: 7.18.3
@ -9338,7 +9338,7 @@ snapshots:
socks-proxy-agent@8.0.4: socks-proxy-agent@8.0.4:
dependencies: dependencies:
agent-base: 7.1.1 agent-base: 7.1.1
debug: 4.3.4 debug: 4.3.5
socks: 2.8.3 socks: 2.8.3
transitivePeerDependencies: transitivePeerDependencies:
- supports-color - supports-color

View File

@ -60,7 +60,7 @@ export async function scrapeController(
try { try {
doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
} catch (e) { } catch (e) {
logger.error(`Error in scrapeController: ${e}`); logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime });
if ( if (
e instanceof Error && e instanceof Error &&
(e.message.startsWith("Job wait") || e.message === "timeout") (e.message.startsWith("Job wait") || e.message === "timeout")

View File

@ -96,7 +96,6 @@ export async function runWebScraper({
...internalOptions, ...internalOptions,
}); });
if (!response.success) { if (!response.success) {
error = response.error;
if (response.error instanceof Error) { if (response.error instanceof Error) {
throw response.error; throw response.error;
} else { } else {
@ -124,7 +123,8 @@ export async function runWebScraper({
// status code is good -- do not attempt retry // status code is good -- do not attempt retry
break; break;
} }
} catch (error) { } catch (_error) {
error = _error;
engines = engines =
response !== undefined response !== undefined
? response.engines ? response.engines

View File

@ -123,8 +123,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
// Include specified actions // Include specified actions
...(meta.options.actions ?? []), ...(meta.options.actions ?? []),
]; ];
const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);
const timeout = timeToRun ?? 300000; const timeout = (timeToRun ?? 300000) + totalWait;
const request: FireEngineScrapeRequestCommon & const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestChromeCDP = { FireEngineScrapeRequestChromeCDP = {
@ -146,18 +151,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
// TODO: scrollXPaths // TODO: scrollXPaths
}; };
const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0,
);
let response = await performFireEngineScrape( let response = await performFireEngineScrape(
meta.logger.child({ meta.logger.child({
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
request, request,
}), }),
request, request,
timeout + totalWait, timeout,
); );
specialtyScrapeCheck( specialtyScrapeCheck(
@ -213,7 +213,8 @@ export async function scrapeURLWithFireEnginePlaywright(
meta: Meta, meta: Meta,
timeToRun: number | undefined, timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = timeToRun ?? 300000; const totalWait = meta.options.waitFor;
const timeout = (timeToRun ?? 300000) + totalWait;
const request: FireEngineScrapeRequestCommon & const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = { FireEngineScrapeRequestPlaywright = {
@ -237,7 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright(
request, request,
}), }),
request, request,
timeout + meta.options.waitFor, timeout,
); );
specialtyScrapeCheck( specialtyScrapeCheck(

View File

@ -140,9 +140,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
const { response, tempFilePath } = await downloadFile(meta.id, meta.url); const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
let result: PDFProcessorResult | null = null; let result: PDFProcessorResult | null = null;
if (process.env.LLAMAPARSE_API_KEY) {
// First, try parsing with PdfParse
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
// If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
try { try {
result = await scrapePDFWithLlamaParse( const llamaResult = await scrapePDFWithLlamaParse(
{ {
...meta, ...meta,
logger: meta.logger.child({ logger: meta.logger.child({
@ -152,16 +166,17 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
tempFilePath, tempFilePath,
timeToRun, timeToRun,
); );
result = llamaResult; // Use LlamaParse result if successful
} catch (error) { } catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") { if (error instanceof Error && error.message === "LlamaParse timed out") {
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
error, error,
}); });
} else if (error instanceof RemoveFeatureError) { } else if (error instanceof RemoveFeatureError) {
throw error; throw error;
} else { } else {
meta.logger.warn( meta.logger.warn(
"LlamaParse failed to parse PDF -- falling back to parse-pdf", "LlamaParse failed to parse PDF -- using parse-pdf result",
{ error }, { error },
); );
Sentry.captureException(error); Sentry.captureException(error);
@ -169,18 +184,6 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
} }
} }
if (result === null) {
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
}
await fs.unlink(tempFilePath); await fs.unlink(tempFilePath);
return { return {
@ -190,4 +193,4 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
html: result.html, html: result.html,
markdown: result.markdown, markdown: result.markdown,
}; };
} }

View File

@ -204,7 +204,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
let result: EngineScrapeResultWithContext | null = null; let result: EngineScrapeResultWithContext | null = null;
const timeToRun = meta.options.timeout !== undefined const timeToRun = meta.options.timeout !== undefined
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3)) ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
: undefined : undefined
for (const { engine, unsupportedFeatures } of fallbackList) { for (const { engine, unsupportedFeatures } of fallbackList) {

View File

@ -3,6 +3,7 @@
"rootDir": "./src", "rootDir": "./src",
"lib": ["ES2022", "DOM"], "lib": ["ES2022", "DOM"],
// or higher // or higher
"target": "ES2022", "target": "ES2022",
@ -18,7 +19,7 @@
"*": ["node_modules/*", "src/types/*"], "*": ["node_modules/*", "src/types/*"],
}, },
"inlineSources": true "inlineSources": true,
}, },
"include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"] "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
} }