diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index 6d971708..569eafd9 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -7478,7 +7478,7 @@ snapshots:
 
   extract-zip@2.0.1:
     dependencies:
-      debug: 4.3.4
+      debug: 4.3.5
       get-stream: 5.2.0
       yauzl: 2.10.0
     optionalDependencies:
@@ -7622,7 +7622,7 @@ snapshots:
     dependencies:
      basic-ftp: 5.0.5
      data-uri-to-buffer: 6.0.2
-      debug: 4.3.4
+      debug: 4.3.5
      fs-extra: 11.2.0
    transitivePeerDependencies:
      - supports-color
@@ -7723,7 +7723,7 @@ snapshots:
   http-proxy-agent@7.0.2:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
     transitivePeerDependencies:
       - supports-color
 
@@ -7771,7 +7771,7 @@ snapshots:
   https-proxy-agent@7.0.5:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
     transitivePeerDependencies:
       - supports-color
 
@@ -8836,7 +8836,7 @@ snapshots:
     dependencies:
       '@tootallnate/quickjs-emscripten': 0.23.0
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
       get-uri: 6.0.3
       http-proxy-agent: 7.0.2
       https-proxy-agent: 7.0.5
@@ -9031,7 +9031,7 @@ snapshots:
   proxy-agent@6.4.0:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
       http-proxy-agent: 7.0.2
       https-proxy-agent: 7.0.5
       lru-cache: 7.18.3
@@ -9338,7 +9338,7 @@ snapshots:
   socks-proxy-agent@8.0.4:
     dependencies:
       agent-base: 7.1.1
-      debug: 4.3.4
+      debug: 4.3.5
       socks: 2.8.3
     transitivePeerDependencies:
       - supports-color
diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts
index ddd5da74..f1fe3431 100644
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -60,7 +60,7 @@ export async function scrapeController(
   try {
     doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
   } catch (e) {
-    logger.error(`Error in scrapeController: ${e}`);
+    logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime });
     if (
       e instanceof Error &&
       (e.message.startsWith("Job wait") || e.message === "timeout")
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 83e899bb..5fb574d4 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -96,7 +96,6 @@ export async function runWebScraper({
         ...internalOptions,
       });
       if (!response.success) {
-        error = response.error;
         if (response.error instanceof Error) {
           throw response.error;
         } else {
@@ -124,7 +123,8 @@ export async function runWebScraper({
         // status code is good -- do not attempt retry
         break;
       }
-    } catch (error) {
+    } catch (_error) {
+      error = _error;
       engines =
         response !== undefined
           ? response.engines
diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
index a2deeed2..2dc134c9 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
@@ -123,8 +123,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
     // Include specified actions
     ...(meta.options.actions ?? []),
   ];
+
+  const totalWait = actions.reduce(
+    (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
+    0,
+  );
 
-  const timeout = timeToRun ?? 300000;
+  const timeout = (timeToRun ?? 300000) + totalWait;
 
   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestChromeCDP = {
@@ -146,18 +151,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
     // TODO: scrollXPaths
   };
 
-  const totalWait = actions.reduce(
-    (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
-    0,
-  );
-
   let response = await performFireEngineScrape(
     meta.logger.child({
       method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
       request,
     }),
     request,
-    timeout + totalWait,
+    timeout,
   );
 
   specialtyScrapeCheck(
@@ -213,7 +213,8 @@ export async function scrapeURLWithFireEnginePlaywright(
   meta: Meta,
   timeToRun: number | undefined,
 ): Promise<EngineScrapeResult> {
-  const timeout = timeToRun ?? 300000;
+  const totalWait = meta.options.waitFor;
+  const timeout = (timeToRun ?? 300000) + totalWait;
 
   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestPlaywright = {
@@ -237,7 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright(
       request,
     }),
     request,
-    timeout + meta.options.waitFor,
+    timeout,
   );
 
   specialtyScrapeCheck(
diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
index 24d5f002..0983e4b1 100644
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -140,9 +140,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
   const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
 
   let result: PDFProcessorResult | null = null;
-  if (process.env.LLAMAPARSE_API_KEY) {
+
+  // First, try parsing with PdfParse
+  result = await scrapePDFWithParsePDF(
+    {
+      ...meta,
+      logger: meta.logger.child({
+        method: "scrapePDF/scrapePDFWithParsePDF",
+      }),
+    },
+    tempFilePath,
+  );
+
+
+  // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
+  if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
     try {
-      result = await scrapePDFWithLlamaParse(
+      const llamaResult = await scrapePDFWithLlamaParse(
         {
           ...meta,
           logger: meta.logger.child({
@@ -152,16 +166,17 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
         tempFilePath,
         timeToRun,
       );
+      result = llamaResult; // Use LlamaParse result if successful
     } catch (error) {
       if (error instanceof Error && error.message === "LlamaParse timed out") {
-        meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
+        meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
           error,
         });
       } else if (error instanceof RemoveFeatureError) {
         throw error;
       } else {
         meta.logger.warn(
-          "LlamaParse failed to parse PDF -- falling back to parse-pdf",
+          "LlamaParse failed to parse PDF -- using parse-pdf result",
           { error },
         );
         Sentry.captureException(error);
@@ -169,18 +184,6 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
     }
   }
 
-  if (result === null) {
-    result = await scrapePDFWithParsePDF(
-      {
-        ...meta,
-        logger: meta.logger.child({
-          method: "scrapePDF/scrapePDFWithParsePDF",
-        }),
-      },
-      tempFilePath,
-    );
-  }
-
   await fs.unlink(tempFilePath);
 
   return {
@@ -190,4 +193,4 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
     html: result.html,
     markdown: result.markdown,
   };
-}
+}
\ No newline at end of file
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index 800457a8..93bdb71b 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -204,7 +204,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
   let result: EngineScrapeResultWithContext | null = null;
 
   const timeToRun = meta.options.timeout !== undefined
-    ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3))
+    ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
     : undefined
 
   for (const { engine, unsupportedFeatures } of fallbackList) {
diff --git a/apps/api/tsconfig.json b/apps/api/tsconfig.json
index 29093be6..ab2a9546 100644
--- a/apps/api/tsconfig.json
+++ b/apps/api/tsconfig.json
@@ -3,6 +3,7 @@
     "rootDir": "./src",
     "lib": ["ES2022", "DOM"],
 
+    // or higher
     "target": "ES2022",
 
 
@@ -18,7 +19,7 @@
       "*": ["node_modules/*", "src/types/*"],
     },
 
-    "inlineSources": true
+    "inlineSources": true,
   },
   "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
 }