From 139e2c9a0591bbbbf09554fccb11ea5c23bd8163 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 16 Dec 2024 22:24:00 +0100 Subject: [PATCH 01/10] fix(runWebScraper): proper error handling --- apps/api/src/main/runWebScraper.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 83e899bb..5fb574d4 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -96,7 +96,6 @@ export async function runWebScraper({ ...internalOptions, }); if (!response.success) { - error = response.error; if (response.error instanceof Error) { throw response.error; } else { @@ -124,7 +123,8 @@ export async function runWebScraper({ // status code is good -- do not attempt retry break; } - } catch (error) { + } catch (_error) { + error = _error; engines = response !== undefined ? response.engines From 0013bdfcb4bd63d6901bc3778fea01371c0276db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 16 Dec 2024 22:42:51 +0100 Subject: [PATCH 02/10] feat(v1/scrape): add more context to timeout logs --- apps/api/src/controllers/v1/scrape.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index ddd5da74..f1fe3431 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -60,7 +60,7 @@ export async function scrapeController( try { doc = await waitForJob(jobId, timeout + totalWait); // TODO: better types for this } catch (e) { - logger.error(`Error in scrapeController: ${e}`); + logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime }); if ( e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout") From 284a6ccedd1baede825571ee933eb7e4f773e2de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 16 Dec 2024 23:01:34 +0100 Subject: [PATCH 03/10] fix(scrapeURL): better timeToRun distribution --- apps/api/src/scraper/scrapeURL/index.ts | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 800457a8..31f7e2f2 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -203,15 +203,20 @@ async function scrapeURLLoop(meta: Meta): Promise { const results: EngineResultsTracker = {}; let result: EngineScrapeResultWithContext | null = null; - const timeToRun = meta.options.timeout !== undefined - ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3)) - : undefined + let ttrInstanceCount = Math.min(fallbackList.length, 3); + let ttrRatios = new Array(ttrInstanceCount).fill(0).map((_, i) => ttrInstanceCount - i); + let ttrRatioSum = ttrRatios.reduce((a, x) => a + x, 0); - for (const { engine, unsupportedFeatures } of fallbackList) { + const timeToRun = meta.options.timeout !== undefined + ? ttrRatios.map(ratio => Math.round(meta.options.timeout! * ratio / ttrRatioSum)).map(ratio => isNaN(ratio) ? undefined : ratio) + : [undefined] + + for (const i in fallbackList) { + const { engine, unsupportedFeatures } = fallbackList[i]; const startedAt = Date.now(); try { meta.logger.info("Scraping via " + engine + "..."); - const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun); + const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun[i] ?? timeToRun.slice(-1)[0]); if (_engineResult.markdown === undefined) { // Some engines emit Markdown directly. _engineResult.markdown = await parseMarkdown(_engineResult.html); From 7f57c868be83f2c1c5e92f0eaba81dcdde0f1836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 16 Dec 2024 23:08:20 +0100 Subject: [PATCH 04/10] Revert "fix(scrapeURL): better timeToRun distribution" This reverts commit 284a6ccedd1baede825571ee933eb7e4f773e2de. --- apps/api/src/scraper/scrapeURL/index.ts | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 31f7e2f2..800457a8 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -203,20 +203,15 @@ async function scrapeURLLoop(meta: Meta): Promise { const results: EngineResultsTracker = {}; let result: EngineScrapeResultWithContext | null = null; - let ttrInstanceCount = Math.min(fallbackList.length, 3); - let ttrRatios = new Array(ttrInstanceCount).fill(0).map((_, i) => ttrInstanceCount - i); - let ttrRatioSum = ttrRatios.reduce((a, x) => a + x, 0); - const timeToRun = meta.options.timeout !== undefined - ? ttrRatios.map(ratio => Math.round(meta.options.timeout! * ratio / ttrRatioSum)).map(ratio => isNaN(ratio) ? undefined : ratio) - : [undefined] + ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3)) + : undefined - for (const i in fallbackList) { - const { engine, unsupportedFeatures } = fallbackList[i]; + for (const { engine, unsupportedFeatures } of fallbackList) { const startedAt = Date.now(); try { meta.logger.info("Scraping via " + engine + "..."); - const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun[i] ?? timeToRun.slice(-1)[0]); + const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun); if (_engineResult.markdown === undefined) { // Some engines emit Markdown directly. _engineResult.markdown = await parseMarkdown(_engineResult.html); From 47b968fedea4e5bfe38b80891b0c983b698f204a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 17 Dec 2024 13:17:55 +0100 Subject: [PATCH 05/10] fix(scrapeURL/fire-engine): timeout calculation issues --- .../scrapeURL/engines/fire-engine/index.ts | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index a2deeed2..2dc134c9 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -123,8 +123,13 @@ export async function scrapeURLWithFireEngineChromeCDP( // Include specified actions ...(meta.options.actions ?? []), ]; + + const totalWait = actions.reduce( + (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a), + 0, + ); - const timeout = timeToRun ?? 300000; + const timeout = (timeToRun ?? 300000) + totalWait; const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = { @@ -146,18 +151,13 @@ export async function scrapeURLWithFireEngineChromeCDP( // TODO: scrollXPaths }; - const totalWait = actions.reduce( - (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a), - 0, - ); - let response = await performFireEngineScrape( meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request, }), request, - timeout + totalWait, + timeout, ); specialtyScrapeCheck( @@ -213,7 +213,8 @@ export async function scrapeURLWithFireEnginePlaywright( meta: Meta, timeToRun: number | undefined, ): Promise { - const timeout = timeToRun ?? 300000; + const totalWait = meta.options.waitFor; + const timeout = (timeToRun ?? 300000) + totalWait; const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = { @@ -237,7 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright( request, }), request, - timeout + meta.options.waitFor, + timeout, ); specialtyScrapeCheck( From 654d6c6e0b128e65f41f40b419b957428ad5b659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 17 Dec 2024 13:21:24 +0100 Subject: [PATCH 06/10] fix(scrapeURL): increase timeToRun --- apps/api/src/scraper/scrapeURL/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 800457a8..93bdb71b 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -204,7 +204,7 @@ async function scrapeURLLoop(meta: Meta): Promise { let result: EngineScrapeResultWithContext | null = null; const timeToRun = meta.options.timeout !== undefined - ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3)) + ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2)) : undefined for (const { engine, unsupportedFeatures } of fallbackList) { From ed7d15d2af23409e351df5a3574ef565f06a29d6 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 09:50:29 -0500 Subject: [PATCH 07/10] Update index.ts --- .../scraper/scrapeURL/engines/pdf/index.ts | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 24d5f002..38dc6b5e 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -140,9 +140,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom const { response, tempFilePath } = await downloadFile(meta.id, meta.url); let result: PDFProcessorResult | null = null; - if (process.env.LLAMAPARSE_API_KEY) { + + // First, try parsing with PdfParse + result = await scrapePDFWithParsePDF( + { + ...meta, + logger: meta.logger.child({ + method: "scrapePDF/scrapePDFWithParsePDF", + }), + }, + tempFilePath, + ); + + + // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse + if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) { try { - result = await scrapePDFWithLlamaParse( + const llamaResult = await scrapePDFWithLlamaParse( { ...meta, logger: meta.logger.child({ @@ -152,16 +166,17 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom tempFilePath, timeToRun, ); + result = llamaResult; // Use LlamaParse result if successful } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { + meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( - "LlamaParse failed to parse PDF -- falling back to parse-pdf", + "LlamaParse failed to parse PDF -- using parse-pdf result", { error }, ); Sentry.captureException(error); @@ -169,18 +184,6 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom } } - if (result === null) { - result = await scrapePDFWithParsePDF( - { - ...meta, - logger: meta.logger.child({ - method: "scrapePDF/scrapePDFWithParsePDF", - }), - }, - tempFilePath, - ); - } - await fs.unlink(tempFilePath); return { From 1402831a0aef09b5a74fed5711efd4627af34e2f Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 09:59:52 -0500 Subject: [PATCH 08/10] Replace pdf parse with pdf to md --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 288 ++++++++++++++++++ .../scraper/scrapeURL/engines/pdf/index.ts | 15 +- 3 files changed, 297 insertions(+), 7 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index c4e70901..f4ac48ff 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -58,6 +58,7 @@ "@devil7softwares/pos": "^1.0.2", "@dqbd/tiktoken": "^1.0.17", "@nangohq/node": "^0.40.8", + "@opendocsg/pdf2md": "^0.2.1", "@sentry/cli": "^2.33.1", "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 6d971708..68d43655 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -29,6 +29,9 @@ importers: '@nangohq/node': specifier: ^0.40.8 version: 0.40.8 + '@opendocsg/pdf2md': + specifier: ^0.2.1 + version: 0.2.1 '@sentry/cli': specifier: ^2.33.1 version: 2.33.1 @@ -794,6 +797,10 @@ packages: resolution: {integrity: sha512-cXWgKE3sdWLSqAa8ykbCcUsUF1Kyr5J3HOWYGuobhPEycXW4WI++d5DhzdpL238mzoEXTi90VqfSCra37l5YqA==} engines: {node: '>=18'} + '@mapbox/node-pre-gyp@1.0.11': + resolution: {integrity: sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==} + hasBin: true + '@mixmark-io/domino@2.2.0': resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} @@ -837,6 +844,10 @@ packages: '@one-ini/wasm@0.1.1': resolution: {integrity: sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==} + '@opendocsg/pdf2md@0.2.1': + resolution: {integrity: sha512-k/yvfrTb+GPTIIm/bMm5IsenTqAFl+IqvkBgFwFlmflS5TT7FOfyRLp8MypVWLAG4G9AnT7AZFbdQYgN/CR5BA==} + hasBin: true + '@opentelemetry/api-logs@0.52.1': resolution: {integrity: sha512-qnSqB2DQ9TPP96dl8cDubDvrUyWc0/sK81xHTK8eSUspzDM3bsewX903qclQFvVhgStjRWdC5bLb3kQqMkfV5A==} engines: {node: '>=14'} @@ -1615,6 +1626,9 @@ packages: resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==} engines: {node: '>=10.0.0'} + abbrev@1.1.1: + resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==} + abbrev@2.0.0: resolution: {integrity: sha512-6/mh1E2u2YgEsCHdY0Yx5oW+61gZU+1vXaoiHHrpKeuRNNgFvS+/jrwHiQhB5apAf5oB7UB7E19ol2R2LKH8hQ==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -1708,6 +1722,14 @@ packages: resolution: {integrity: sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==} engines: {node: '>=0.2.6'} + aproba@2.0.0: + resolution: {integrity: sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==} + + are-we-there-yet@2.0.0: + resolution: {integrity: sha512-Ci/qENmwHnsYo9xKIcUJN5LeDKdJ6R1Z1j9V/J5wyq8nh/mYPEpIKJbBZXtZjG04HiK7zV/p6Vs9952MrMeUIw==} + engines: {node: '>=10'} + deprecated: This package is no longer supported. + arg@4.1.3: resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==} @@ -1908,6 +1930,10 @@ packages: caniuse-lite@1.0.30001627: resolution: {integrity: sha512-4zgNiB8nTyV/tHhwZrFs88ryjls/lHiqFhrxCW4qSTeuRByBVnPYpDInchOIySWknznucaf31Z4KYqjfbrecVw==} + canvas@2.11.2: + resolution: {integrity: sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==} + engines: {node: '>=6'} + chalk@2.4.2: resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==} engines: {node: '>=4'} @@ -1938,6 +1964,10 @@ packages: resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==} engines: {node: '>= 8.10.0'} + chownr@2.0.0: + resolution: {integrity: sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==} + engines: {node: '>=10'} + chownr@3.0.0: resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==} engines: {node: '>=18'} @@ -1995,6 +2025,10 @@ packages: color-string@1.9.1: resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==} + color-support@1.1.3: + resolution: {integrity: sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==} + hasBin: true + color@3.2.1: resolution: {integrity: sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==} @@ -2021,6 +2055,9 @@ packages: config-chain@1.1.13: resolution: {integrity: sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ==} + console-control-strings@1.1.0: + resolution: {integrity: sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==} + content-disposition@0.5.4: resolution: {integrity: sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==} engines: {node: '>= 0.6'} @@ -2147,6 +2184,10 @@ packages: resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==} engines: {node: '>=10'} + decompress-response@4.2.1: + resolution: {integrity: sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==} + engines: {node: '>=8'} + dedent@1.5.3: resolution: {integrity: sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==} peerDependencies: @@ -2171,6 +2212,9 @@ packages: resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==} engines: {node: '>=0.4.0'} + delegates@1.0.0: + resolution: {integrity: sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==} + denque@2.1.0: resolution: {integrity: sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw==} engines: {node: '>=0.10'} @@ -2283,6 +2327,9 @@ packages: resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} engines: {node: '>=0.12'} + enumify@1.0.4: + resolution: {integrity: sha512-5mwWXaVzJaqyUdOW/PDH5QySRgmQ8VvujmxmvXoXj9w0n+6omhVuyD56eI37FMqy/LxueJzsQ4DrHVQzuT/TXg==} + env-paths@2.2.1: resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==} engines: {node: '>=6'} @@ -2484,6 +2531,10 @@ packages: resolution: {integrity: sha512-PmDi3uwK5nFuXh7XDTlVnS17xJS7vW36is2+w3xcv8SVxiB4NyATf4ctkVY5bkSjX0Y4nbvZCq1/EjtEyr9ktw==} engines: {node: '>=14.14'} + fs-minipass@2.1.0: + resolution: {integrity: sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==} + engines: {node: '>= 8'} + fs.realpath@1.0.0: resolution: {integrity: sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==} @@ -2495,6 +2546,11 @@ packages: function-bind@1.1.2: resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} + gauge@3.0.2: + resolution: {integrity: sha512-+5J6MS/5XksCuXq++uFRsnUd7Ovu1XenbeuIuNRJxYWjgQbPuFhT14lAvsWfqfAmnwluf1OwMjz39HjfLPci0Q==} + engines: {node: '>=10'} + deprecated: This package is no longer supported. + generic-pool@3.9.0: resolution: {integrity: sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==} engines: {node: '>= 4'} @@ -2578,6 +2634,9 @@ packages: resolution: {integrity: sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==} engines: {node: '>= 0.4'} + has-unicode@2.0.1: + resolution: {integrity: sha512-8Rf9Y83NBReMnx0gFzA8JImQACstCYWUplepDa9xprwwtmgEZUF0h/i5xSA625zB/I37EtrswSST6OXxwaaIJQ==} + hasown@2.0.2: resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==} engines: {node: '>= 0.4'} @@ -3256,6 +3315,10 @@ packages: resolution: {integrity: sha512-zobTr7akeGHnv7eBOXcRgMeCP6+uyYsczwmeRCauvpvaAltgNyTbLH/+VaEAPUeWBT+1GuNmz4wC/6jtQzbbVA==} engines: {node: '>=12'} + make-dir@3.1.0: + resolution: {integrity: sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==} + engines: {node: '>=8'} + make-dir@4.0.0: resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==} engines: {node: '>=10'} @@ -3326,6 +3389,10 @@ packages: resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} engines: {node: '>=6'} + mimic-response@2.1.0: + resolution: {integrity: sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==} + engines: {node: '>=8'} + minimatch@3.1.2: resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==} @@ -3344,10 +3411,22 @@ packages: minimist@1.2.8: resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} + minipass@3.3.6: + resolution: {integrity: sha512-DxiNidxSEK+tHG6zOIklvNOwm3hvCrbUrdtzY74U6HKTJxvIDfOUL5W5P2Ghd3DTkhhKPYGqeNUIh5qcM4YBfw==} + engines: {node: '>=8'} + + minipass@5.0.0: + resolution: {integrity: sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ==} + engines: {node: '>=8'} + minipass@7.1.2: resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==} engines: {node: '>=16 || 14 >=14.17'} + minizlib@2.1.2: + resolution: {integrity: sha512-bAxsR8BVfj60DWXHE3u30oHzfl4G7khkSuPW+qvpd7jFRHm7dLxOjUk1EHACJ/hxLY8phGJ0YhYHZo7jil7Qdg==} + engines: {node: '>= 8'} + minizlib@3.0.1: resolution: {integrity: sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==} engines: {node: '>= 18'} @@ -3359,6 +3438,11 @@ packages: resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==} hasBin: true + mkdirp@1.0.4: + resolution: {integrity: sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==} + engines: {node: '>=10'} + hasBin: true + mkdirp@3.0.1: resolution: {integrity: sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==} engines: {node: '>=10'} @@ -3447,6 +3531,9 @@ packages: resolution: {integrity: sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==} hasBin: true + nan@2.22.0: + resolution: {integrity: sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==} + natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} @@ -3507,6 +3594,11 @@ packages: engines: {node: '>=8.10.0'} hasBin: true + nopt@5.0.0: + resolution: {integrity: sha512-Tbj67rffqceeLpcRXrT7vKAN8CwfPeIBgM7E6iBkmKLV7bEMwpGgYLGv0jACUsECaa/vuxP0IjEont6umdMgtQ==} + engines: {node: '>=6'} + hasBin: true + nopt@7.2.1: resolution: {integrity: sha512-taM24ViiimT/XntxbPyJQzCG+p4EKOpgD3mxFwW38mGjVUrfERQOeY4EDHjdnptttfHuHQXFx+lTP08Q+mLa/w==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -3524,6 +3616,10 @@ packages: resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==} engines: {node: '>=8'} + npmlog@5.0.1: + resolution: {integrity: sha512-AqZtDUWOMKs1G/8lwylVjrdYgqA4d9nu8hc+0gzRxlDb1I10+FHBGMXs6aiQHFdCUUlqH99MUMuLfzWDNDtfxw==} + deprecated: This package is no longer supported. + nth-check@2.1.1: resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} @@ -3949,6 +4045,11 @@ packages: resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} engines: {node: '>= 4'} + rimraf@3.0.2: + resolution: {integrity: sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==} + deprecated: Rimraf versions prior to v4 are no longer supported + hasBin: true + rimraf@5.0.7: resolution: {integrity: sha512-nV6YcJo5wbLW77m+8KjH8aB/7/rxQy9SZ0HY5shnwULfS+9nmTtVXAJET5NdZmCzA4fPI/Hm1wo/Po/4mopOdg==} engines: {node: '>=14.18'} @@ -4019,6 +4120,9 @@ packages: resolution: {integrity: sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==} engines: {node: '>= 0.8.0'} + set-blocking@2.0.0: + resolution: {integrity: sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==} + set-function-length@1.2.2: resolution: {integrity: sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==} engines: {node: '>= 0.4'} @@ -4057,6 +4161,12 @@ packages: resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==} engines: {node: '>=14'} + simple-concat@1.0.1: + resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==} + + simple-get@3.1.1: + resolution: {integrity: sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==} + simple-swizzle@0.2.2: resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==} @@ -4215,6 +4325,10 @@ packages: tar-stream@3.1.7: resolution: {integrity: sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==} + tar@6.2.1: + resolution: {integrity: sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A==} + engines: {node: '>=10'} + tar@7.2.0: resolution: {integrity: sha512-hctwP0Nb4AB60bj8WQgRYaMOuJYRAPMGiQUAotms5igN8ppfQM+IvjQ5HcKu1MaZh2Wy2KWVTe563Yj8dfc14w==} engines: {node: '>=18'} @@ -4369,6 +4483,9 @@ packages: resolution: {integrity: sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==} engines: {node: '>= 10.0.0'} + unpdf@0.12.1: + resolution: {integrity: sha512-ktP8+TTLDBrlu/j8rQVNbHoMMpFXzkVAkb1rt/JdshFC3jOHdZjuGCNl/voPL0kraUrUOH7ZC88kVxMvlvDBzA==} + unpipe@1.0.0: resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==} engines: {node: '>= 0.8'} @@ -4456,6 +4573,9 @@ packages: engines: {node: '>= 8'} hasBin: true + wide-align@1.1.5: + resolution: {integrity: sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==} + winston-transport@4.8.0: resolution: {integrity: sha512-qxSTKswC6llEMZKgCQdaWgDuMJQnhuvF5f2Nk3SNXc4byfQ+voo2mX1Px9dkNOuR8p0KAjfPG29PuYUSIb+vSA==} engines: {node: '>= 12.0.0'} @@ -5607,6 +5727,22 @@ snapshots: - langchain - openai + '@mapbox/node-pre-gyp@1.0.11': + dependencies: + detect-libc: 2.0.3 + https-proxy-agent: 5.0.1 + make-dir: 3.1.0 + node-fetch: 2.7.0 + nopt: 5.0.0 + npmlog: 5.0.1 + rimraf: 3.0.2 + semver: 7.6.2 + tar: 6.2.1 + transitivePeerDependencies: + - encoding + - supports-color + optional: true + '@mixmark-io/domino@2.2.0': {} '@mongodb-js/saslprep@1.1.7': @@ -5639,6 +5775,15 @@ snapshots: '@one-ini/wasm@0.1.1': {} + '@opendocsg/pdf2md@0.2.1': + dependencies: + enumify: 1.0.4 + minimist: 1.2.8 + unpdf: 0.12.1 + transitivePeerDependencies: + - encoding + - supports-color + '@opentelemetry/api-logs@0.52.1': dependencies: '@opentelemetry/api': 1.9.0 @@ -6673,6 +6818,9 @@ snapshots: '@xmldom/xmldom@0.8.10': {} + abbrev@1.1.1: + optional: true + abbrev@2.0.0: {} abort-controller@3.0.0: @@ -6755,6 +6903,15 @@ snapshots: dependencies: sylvester: 0.0.12 + aproba@2.0.0: + optional: true + + are-we-there-yet@2.0.0: + dependencies: + delegates: 1.0.0 + readable-stream: 3.6.2 + optional: true + arg@4.1.3: {} argparse@1.0.10: @@ -7008,6 +7165,16 @@ snapshots: caniuse-lite@1.0.30001627: {} + canvas@2.11.2: + dependencies: + '@mapbox/node-pre-gyp': 1.0.11 + nan: 2.22.0 + simple-get: 3.1.1 + transitivePeerDependencies: + - encoding + - supports-color + optional: true + chalk@2.4.2: dependencies: ansi-styles: 3.2.1 @@ -7056,6 +7223,9 @@ snapshots: optionalDependencies: fsevents: 2.3.3 + chownr@2.0.0: + optional: true + chownr@3.0.0: {} chromium-bidi@0.5.24(devtools-protocol@0.0.1299070): @@ -7121,6 +7291,9 @@ snapshots: color-name: 1.1.4 simple-swizzle: 0.2.2 + color-support@1.1.3: + optional: true + color@3.2.1: dependencies: color-convert: 1.9.3 @@ -7148,6 +7321,9 @@ snapshots: ini: 1.3.8 proto-list: 1.2.4 + console-control-strings@1.1.0: + optional: true + content-disposition@0.5.4: dependencies: safe-buffer: 5.2.1 @@ -7255,6 +7431,11 @@ snapshots: decamelize@4.0.0: {} + decompress-response@4.2.1: + dependencies: + mimic-response: 2.1.0 + optional: true + dedent@1.5.3: {} deepmerge@4.3.1: {} @@ -7273,6 +7454,9 @@ snapshots: delayed-stream@1.0.0: {} + delegates@1.0.0: + optional: true + denque@2.1.0: {} depd@2.0.0: {} @@ -7364,6 +7548,8 @@ snapshots: entities@4.5.0: {} + enumify@1.0.4: {} + env-paths@2.2.1: {} error-ex@1.3.2: @@ -7589,6 +7775,11 @@ snapshots: jsonfile: 6.1.0 universalify: 2.0.1 + fs-minipass@2.1.0: + dependencies: + minipass: 3.3.6 + optional: true + fs.realpath@1.0.0: {} fsevents@2.3.3: @@ -7596,6 +7787,19 @@ snapshots: function-bind@1.1.2: {} + gauge@3.0.2: + dependencies: + aproba: 2.0.0 + color-support: 1.1.3 + console-control-strings: 1.1.0 + has-unicode: 2.0.1 + object-assign: 4.1.1 + signal-exit: 3.0.7 + string-width: 4.2.3 + strip-ansi: 6.0.1 + wide-align: 1.1.5 + optional: true + generic-pool@3.9.0: {} gensync@1.0.0-beta.2: {} @@ -7683,6 +7887,9 @@ snapshots: has-symbols@1.0.3: {} + has-unicode@2.0.1: + optional: true + hasown@2.0.2: dependencies: function-bind: 1.1.2 @@ -8463,6 +8670,11 @@ snapshots: luxon@3.4.4: {} + make-dir@3.1.0: + dependencies: + semver: 6.3.1 + optional: true + make-dir@4.0.0: dependencies: semver: 7.6.2 @@ -8523,6 +8735,9 @@ snapshots: mimic-fn@2.1.0: {} + mimic-response@2.1.0: + optional: true + minimatch@3.1.2: dependencies: brace-expansion: 1.1.11 @@ -8541,8 +8756,22 @@ snapshots: minimist@1.2.8: {} + minipass@3.3.6: + dependencies: + yallist: 4.0.0 + optional: true + + minipass@5.0.0: + optional: true + minipass@7.1.2: {} + minizlib@2.1.2: + dependencies: + minipass: 3.3.6 + yallist: 4.0.0 + optional: true + minizlib@3.0.1: dependencies: minipass: 7.1.2 @@ -8554,6 +8783,9 @@ snapshots: dependencies: minimist: 1.2.8 + mkdirp@1.0.4: + optional: true + mkdirp@3.0.1: {} ml-array-mean@1.1.6: @@ -8646,6 +8878,9 @@ snapshots: mustache@4.2.0: {} + nan@2.22.0: + optional: true + natural-compare@1.4.0: {} natural@7.0.7(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3): @@ -8726,6 +8961,11 @@ snapshots: touch: 3.1.1 undefsafe: 2.0.5 + nopt@5.0.0: + dependencies: + abbrev: 1.1.1 + optional: true + nopt@7.2.1: dependencies: abbrev: 2.0.0 @@ -8738,6 +8978,14 @@ snapshots: dependencies: path-key: 3.1.1 + npmlog@5.0.1: + dependencies: + are-we-there-yet: 2.0.0 + console-control-strings: 1.1.0 + gauge: 3.0.2 + set-blocking: 2.0.0 + optional: true + nth-check@2.1.1: dependencies: boolbase: 1.0.0 @@ -9210,6 +9458,11 @@ snapshots: retry@0.13.1: {} + rimraf@3.0.2: + dependencies: + glob: 7.2.3 + optional: true + rimraf@5.0.7: dependencies: glob: 10.4.2 @@ -9285,6 +9538,9 @@ snapshots: transitivePeerDependencies: - supports-color + set-blocking@2.0.0: + optional: true + set-function-length@1.2.2: dependencies: define-data-property: 1.1.4 @@ -9321,6 +9577,16 @@ snapshots: signal-exit@4.1.0: {} + simple-concat@1.0.1: + optional: true + + simple-get@3.1.1: + dependencies: + decompress-response: 4.2.1 + once: 1.4.0 + simple-concat: 1.0.1 + optional: true + simple-swizzle@0.2.2: dependencies: is-arrayish: 0.3.2 @@ -9494,6 +9760,16 @@ snapshots: fast-fifo: 1.3.2 streamx: 2.18.0 + tar@6.2.1: + dependencies: + chownr: 2.0.0 + fs-minipass: 2.1.0 + minipass: 5.0.0 + minizlib: 2.1.2 + mkdirp: 1.0.4 + yallist: 4.0.0 + optional: true + tar@7.2.0: dependencies: '@isaacs/fs-minipass': 4.0.1 @@ -9626,6 +9902,13 @@ snapshots: universalify@2.0.1: {} + unpdf@0.12.1: + optionalDependencies: + canvas: 2.11.2 + transitivePeerDependencies: + - encoding + - supports-color + unpipe@1.0.0: {} unstructured-client@0.11.3(zod@3.23.8): @@ -9698,6 +9981,11 @@ snapshots: dependencies: isexe: 2.0.0 + wide-align@1.1.5: + dependencies: + string-width: 4.2.3 + optional: true + winston-transport@4.8.0: dependencies: logform: 2.6.1 diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 38dc6b5e..c3baa694 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -6,7 +6,7 @@ import { robustFetch } from "../../lib/fetch"; import { z } from "zod"; import * as Sentry from "@sentry/node"; import escapeHtml from "escape-html"; -import PdfParse from "pdf-parse"; +import pdf2md from "@opendocsg/pdf2md"; import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; import { RemoveFeatureError } from "../../error"; @@ -113,10 +113,11 @@ async function scrapePDFWithParsePDF( meta: Meta, tempFilePath: string, ): Promise { - meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath }); + meta.logger.debug("Processing PDF document with pdf2md", { tempFilePath }); - const result = await PdfParse(await fs.readFile(tempFilePath)); - const escaped = escapeHtml(result.text); + const pdfBuffer = await fs.readFile(tempFilePath); + const markdown = await pdf2md(pdfBuffer); + const escaped = escapeHtml(markdown); return { markdown: escaped, @@ -141,7 +142,7 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom let result: PDFProcessorResult | null = null; - // First, try parsing with PdfParse + // First, try parsing with pdf2md result = await scrapePDFWithParsePDF( { ...meta, @@ -169,14 +170,14 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom result = llamaResult; // Use LlamaParse result if successful } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { + meta.logger.warn("LlamaParse timed out -- using pdf2md result", { error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( - "LlamaParse failed to parse PDF -- using parse-pdf result", + "LlamaParse failed to parse PDF -- using pdf2md result", { error }, ); Sentry.captureException(error); From 194353af0db6f4734845940589ae233eb1a124f3 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 10:04:20 -0500 Subject: [PATCH 09/10] Remove pdf parse --- apps/api/package.json | 1 - apps/api/pnpm-lock.yaml | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index f4ac48ff..1ccfee5e 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -101,7 +101,6 @@ "mongoose": "^8.4.4", "natural": "^7.0.7", "openai": "^4.57.0", - "pdf-parse": "^1.1.1", "pos": "^0.4.2", "posthog-node": "^4.0.1", "promptable": "^0.0.10", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 68d43655..9014c42e 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -158,9 +158,6 @@ importers: openai: specifier: ^4.57.0 version: 4.57.0(zod@3.23.8) - pdf-parse: - specifier: ^1.1.1 - version: 1.1.1 pos: specifier: ^0.4.2 version: 0.4.2 @@ -8927,7 +8924,8 @@ snapshots: node-domexception@1.0.0: {} - node-ensure@0.0.0: {} + node-ensure@0.0.0: + optional: true node-fetch@2.7.0: dependencies: @@ -9157,6 +9155,7 @@ snapshots: node-ensure: 0.0.0 transitivePeerDependencies: - supports-color + optional: true peberminta@0.9.0: {} From a20a003c740307945ba752d31c3912ba485963a5 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 12:12:22 -0500 Subject: [PATCH 10/10] revert to pdf parse --- apps/api/package.json | 2 +- apps/api/pnpm-lock.yaml | 309 +----------------- .../scraper/scrapeURL/engines/pdf/index.ts | 17 +- apps/api/tsconfig.json | 3 +- 4 files changed, 22 insertions(+), 309 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 1ccfee5e..c4e70901 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -58,7 +58,6 @@ "@devil7softwares/pos": "^1.0.2", "@dqbd/tiktoken": "^1.0.17", "@nangohq/node": "^0.40.8", - "@opendocsg/pdf2md": "^0.2.1", "@sentry/cli": "^2.33.1", "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", @@ -101,6 +100,7 @@ "mongoose": "^8.4.4", "natural": "^7.0.7", "openai": "^4.57.0", + "pdf-parse": "^1.1.1", "pos": "^0.4.2", "posthog-node": "^4.0.1", "promptable": "^0.0.10", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 9014c42e..569eafd9 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -29,9 +29,6 @@ importers: '@nangohq/node': specifier: ^0.40.8 version: 0.40.8 - '@opendocsg/pdf2md': - specifier: ^0.2.1 - version: 0.2.1 '@sentry/cli': specifier: ^2.33.1 version: 2.33.1 @@ -158,6 +155,9 @@ importers: openai: specifier: ^4.57.0 version: 4.57.0(zod@3.23.8) + pdf-parse: + specifier: ^1.1.1 + version: 1.1.1 pos: specifier: ^0.4.2 version: 0.4.2 @@ -794,10 +794,6 @@ packages: resolution: {integrity: sha512-cXWgKE3sdWLSqAa8ykbCcUsUF1Kyr5J3HOWYGuobhPEycXW4WI++d5DhzdpL238mzoEXTi90VqfSCra37l5YqA==} engines: {node: '>=18'} - '@mapbox/node-pre-gyp@1.0.11': - resolution: {integrity: sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==} - hasBin: true - '@mixmark-io/domino@2.2.0': resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} @@ -841,10 +837,6 @@ packages: '@one-ini/wasm@0.1.1': resolution: {integrity: sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==} - '@opendocsg/pdf2md@0.2.1': - resolution: {integrity: sha512-k/yvfrTb+GPTIIm/bMm5IsenTqAFl+IqvkBgFwFlmflS5TT7FOfyRLp8MypVWLAG4G9AnT7AZFbdQYgN/CR5BA==} - hasBin: true - '@opentelemetry/api-logs@0.52.1': resolution: {integrity: sha512-qnSqB2DQ9TPP96dl8cDubDvrUyWc0/sK81xHTK8eSUspzDM3bsewX903qclQFvVhgStjRWdC5bLb3kQqMkfV5A==} engines: {node: '>=14'} @@ -1623,9 +1615,6 @@ packages: resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==} engines: {node: '>=10.0.0'} - abbrev@1.1.1: - resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==} - abbrev@2.0.0: resolution: {integrity: sha512-6/mh1E2u2YgEsCHdY0Yx5oW+61gZU+1vXaoiHHrpKeuRNNgFvS+/jrwHiQhB5apAf5oB7UB7E19ol2R2LKH8hQ==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -1719,14 +1708,6 @@ packages: resolution: {integrity: sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==} engines: {node: '>=0.2.6'} - aproba@2.0.0: - resolution: {integrity: sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==} - - are-we-there-yet@2.0.0: - resolution: {integrity: sha512-Ci/qENmwHnsYo9xKIcUJN5LeDKdJ6R1Z1j9V/J5wyq8nh/mYPEpIKJbBZXtZjG04HiK7zV/p6Vs9952MrMeUIw==} - engines: {node: '>=10'} - deprecated: This package is no longer supported. - arg@4.1.3: resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==} @@ -1927,10 +1908,6 @@ packages: caniuse-lite@1.0.30001627: resolution: {integrity: sha512-4zgNiB8nTyV/tHhwZrFs88ryjls/lHiqFhrxCW4qSTeuRByBVnPYpDInchOIySWknznucaf31Z4KYqjfbrecVw==} - canvas@2.11.2: - resolution: {integrity: sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==} - engines: {node: '>=6'} - chalk@2.4.2: resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==} engines: {node: '>=4'} @@ -1961,10 +1938,6 @@ packages: resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==} engines: {node: '>= 8.10.0'} - chownr@2.0.0: - resolution: {integrity: sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==} - engines: {node: '>=10'} - chownr@3.0.0: resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==} engines: {node: '>=18'} @@ -2022,10 +1995,6 @@ packages: color-string@1.9.1: resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==} - color-support@1.1.3: - resolution: {integrity: sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==} - hasBin: true - color@3.2.1: resolution: {integrity: sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==} @@ -2052,9 +2021,6 @@ packages: config-chain@1.1.13: resolution: {integrity: sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ==} - console-control-strings@1.1.0: - resolution: {integrity: sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==} - content-disposition@0.5.4: resolution: {integrity: sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==} engines: {node: '>= 0.6'} @@ -2181,10 +2147,6 @@ packages: resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==} engines: {node: '>=10'} - decompress-response@4.2.1: - resolution: {integrity: sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==} - engines: {node: '>=8'} - dedent@1.5.3: resolution: {integrity: sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==} peerDependencies: @@ -2209,9 +2171,6 @@ packages: resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==} engines: {node: '>=0.4.0'} - delegates@1.0.0: - resolution: {integrity: sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==} - denque@2.1.0: resolution: {integrity: sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw==} engines: {node: '>=0.10'} @@ -2324,9 +2283,6 @@ packages: resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} engines: {node: '>=0.12'} - enumify@1.0.4: - resolution: {integrity: sha512-5mwWXaVzJaqyUdOW/PDH5QySRgmQ8VvujmxmvXoXj9w0n+6omhVuyD56eI37FMqy/LxueJzsQ4DrHVQzuT/TXg==} - env-paths@2.2.1: resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==} engines: {node: '>=6'} @@ -2528,10 +2484,6 @@ packages: resolution: {integrity: sha512-PmDi3uwK5nFuXh7XDTlVnS17xJS7vW36is2+w3xcv8SVxiB4NyATf4ctkVY5bkSjX0Y4nbvZCq1/EjtEyr9ktw==} engines: {node: '>=14.14'} - fs-minipass@2.1.0: - resolution: {integrity: sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==} - engines: {node: '>= 8'} - fs.realpath@1.0.0: resolution: {integrity: sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==} @@ -2543,11 +2495,6 @@ packages: function-bind@1.1.2: resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} - gauge@3.0.2: - resolution: {integrity: sha512-+5J6MS/5XksCuXq++uFRsnUd7Ovu1XenbeuIuNRJxYWjgQbPuFhT14lAvsWfqfAmnwluf1OwMjz39HjfLPci0Q==} - engines: {node: '>=10'} - deprecated: This package is no longer supported. - generic-pool@3.9.0: resolution: {integrity: sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==} engines: {node: '>= 4'} @@ -2631,9 +2578,6 @@ packages: resolution: {integrity: sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==} engines: {node: '>= 0.4'} - has-unicode@2.0.1: - resolution: {integrity: sha512-8Rf9Y83NBReMnx0gFzA8JImQACstCYWUplepDa9xprwwtmgEZUF0h/i5xSA625zB/I37EtrswSST6OXxwaaIJQ==} - hasown@2.0.2: resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==} engines: {node: '>= 0.4'} @@ -3312,10 +3256,6 @@ packages: resolution: {integrity: sha512-zobTr7akeGHnv7eBOXcRgMeCP6+uyYsczwmeRCauvpvaAltgNyTbLH/+VaEAPUeWBT+1GuNmz4wC/6jtQzbbVA==} engines: {node: '>=12'} - make-dir@3.1.0: - resolution: {integrity: sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==} - engines: {node: '>=8'} - make-dir@4.0.0: resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==} engines: {node: '>=10'} @@ -3386,10 +3326,6 @@ packages: resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} engines: {node: '>=6'} - mimic-response@2.1.0: - resolution: {integrity: sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==} - engines: {node: '>=8'} - minimatch@3.1.2: resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==} @@ -3408,22 +3344,10 @@ packages: minimist@1.2.8: resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} - minipass@3.3.6: - resolution: {integrity: sha512-DxiNidxSEK+tHG6zOIklvNOwm3hvCrbUrdtzY74U6HKTJxvIDfOUL5W5P2Ghd3DTkhhKPYGqeNUIh5qcM4YBfw==} - engines: {node: '>=8'} - - minipass@5.0.0: - resolution: {integrity: sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ==} - engines: {node: '>=8'} - minipass@7.1.2: resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==} engines: {node: '>=16 || 14 >=14.17'} - minizlib@2.1.2: - resolution: {integrity: sha512-bAxsR8BVfj60DWXHE3u30oHzfl4G7khkSuPW+qvpd7jFRHm7dLxOjUk1EHACJ/hxLY8phGJ0YhYHZo7jil7Qdg==} - engines: {node: '>= 8'} - minizlib@3.0.1: resolution: {integrity: sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==} engines: {node: '>= 18'} @@ -3435,11 +3359,6 @@ packages: resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==} hasBin: true - mkdirp@1.0.4: - resolution: {integrity: sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==} - engines: {node: '>=10'} - hasBin: true - mkdirp@3.0.1: resolution: {integrity: sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==} engines: {node: '>=10'} @@ -3528,9 +3447,6 @@ packages: resolution: {integrity: sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==} hasBin: true - nan@2.22.0: - resolution: {integrity: sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==} - natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} @@ -3591,11 +3507,6 @@ packages: engines: {node: '>=8.10.0'} hasBin: true - nopt@5.0.0: - resolution: {integrity: sha512-Tbj67rffqceeLpcRXrT7vKAN8CwfPeIBgM7E6iBkmKLV7bEMwpGgYLGv0jACUsECaa/vuxP0IjEont6umdMgtQ==} - engines: {node: '>=6'} - hasBin: true - nopt@7.2.1: resolution: {integrity: sha512-taM24ViiimT/XntxbPyJQzCG+p4EKOpgD3mxFwW38mGjVUrfERQOeY4EDHjdnptttfHuHQXFx+lTP08Q+mLa/w==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -3613,10 +3524,6 @@ packages: resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==} engines: {node: '>=8'} - npmlog@5.0.1: - resolution: {integrity: sha512-AqZtDUWOMKs1G/8lwylVjrdYgqA4d9nu8hc+0gzRxlDb1I10+FHBGMXs6aiQHFdCUUlqH99MUMuLfzWDNDtfxw==} - deprecated: This package is no longer supported. - nth-check@2.1.1: resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} @@ -4042,11 +3949,6 @@ packages: resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} engines: {node: '>= 4'} - rimraf@3.0.2: - resolution: {integrity: sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==} - deprecated: Rimraf versions prior to v4 are no longer supported - hasBin: true - rimraf@5.0.7: resolution: {integrity: sha512-nV6YcJo5wbLW77m+8KjH8aB/7/rxQy9SZ0HY5shnwULfS+9nmTtVXAJET5NdZmCzA4fPI/Hm1wo/Po/4mopOdg==} engines: {node: '>=14.18'} @@ -4117,9 +4019,6 @@ packages: resolution: {integrity: sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==} engines: {node: '>= 0.8.0'} - set-blocking@2.0.0: - resolution: {integrity: sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==} - set-function-length@1.2.2: resolution: {integrity: sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==} engines: {node: '>= 0.4'} @@ -4158,12 +4057,6 @@ packages: resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==} engines: {node: '>=14'} - simple-concat@1.0.1: - resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==} - - simple-get@3.1.1: - resolution: {integrity: sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==} - simple-swizzle@0.2.2: resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==} @@ -4322,10 +4215,6 @@ packages: tar-stream@3.1.7: resolution: {integrity: sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==} - tar@6.2.1: - resolution: {integrity: sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A==} - engines: {node: '>=10'} - tar@7.2.0: resolution: {integrity: sha512-hctwP0Nb4AB60bj8WQgRYaMOuJYRAPMGiQUAotms5igN8ppfQM+IvjQ5HcKu1MaZh2Wy2KWVTe563Yj8dfc14w==} engines: {node: '>=18'} @@ -4480,9 +4369,6 @@ packages: resolution: {integrity: sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==} engines: {node: '>= 10.0.0'} - unpdf@0.12.1: - resolution: {integrity: sha512-ktP8+TTLDBrlu/j8rQVNbHoMMpFXzkVAkb1rt/JdshFC3jOHdZjuGCNl/voPL0kraUrUOH7ZC88kVxMvlvDBzA==} - unpipe@1.0.0: resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==} engines: {node: '>= 0.8'} @@ -4570,9 +4456,6 @@ packages: engines: {node: '>= 8'} hasBin: true - wide-align@1.1.5: - resolution: {integrity: sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==} - winston-transport@4.8.0: resolution: {integrity: sha512-qxSTKswC6llEMZKgCQdaWgDuMJQnhuvF5f2Nk3SNXc4byfQ+voo2mX1Px9dkNOuR8p0KAjfPG29PuYUSIb+vSA==} engines: {node: '>= 12.0.0'} @@ -5724,22 +5607,6 @@ snapshots: - langchain - openai - '@mapbox/node-pre-gyp@1.0.11': - dependencies: - detect-libc: 2.0.3 - https-proxy-agent: 5.0.1 - make-dir: 3.1.0 - node-fetch: 2.7.0 - nopt: 5.0.0 - npmlog: 5.0.1 - rimraf: 3.0.2 - semver: 7.6.2 - tar: 6.2.1 - transitivePeerDependencies: - - encoding - - supports-color - optional: true - '@mixmark-io/domino@2.2.0': {} '@mongodb-js/saslprep@1.1.7': @@ -5772,15 +5639,6 @@ snapshots: '@one-ini/wasm@0.1.1': {} - '@opendocsg/pdf2md@0.2.1': - dependencies: - enumify: 1.0.4 - minimist: 1.2.8 - unpdf: 0.12.1 - transitivePeerDependencies: - - encoding - - supports-color - '@opentelemetry/api-logs@0.52.1': dependencies: '@opentelemetry/api': 1.9.0 @@ -6815,9 +6673,6 @@ snapshots: '@xmldom/xmldom@0.8.10': {} - abbrev@1.1.1: - optional: true - abbrev@2.0.0: {} abort-controller@3.0.0: @@ -6900,15 +6755,6 @@ snapshots: dependencies: sylvester: 0.0.12 - aproba@2.0.0: - optional: true - - are-we-there-yet@2.0.0: - dependencies: - delegates: 1.0.0 - readable-stream: 3.6.2 - optional: true - arg@4.1.3: {} argparse@1.0.10: @@ -7162,16 +7008,6 @@ snapshots: caniuse-lite@1.0.30001627: {} - canvas@2.11.2: - dependencies: - '@mapbox/node-pre-gyp': 1.0.11 - nan: 2.22.0 - simple-get: 3.1.1 - transitivePeerDependencies: - - encoding - - supports-color - optional: true - chalk@2.4.2: dependencies: ansi-styles: 3.2.1 @@ -7220,9 +7056,6 @@ snapshots: optionalDependencies: fsevents: 2.3.3 - chownr@2.0.0: - optional: true - chownr@3.0.0: {} chromium-bidi@0.5.24(devtools-protocol@0.0.1299070): @@ -7288,9 +7121,6 @@ snapshots: color-name: 1.1.4 simple-swizzle: 0.2.2 - color-support@1.1.3: - optional: true - color@3.2.1: dependencies: color-convert: 1.9.3 @@ -7318,9 +7148,6 @@ snapshots: ini: 1.3.8 proto-list: 1.2.4 - console-control-strings@1.1.0: - optional: true - content-disposition@0.5.4: dependencies: safe-buffer: 5.2.1 @@ -7428,11 +7255,6 @@ snapshots: decamelize@4.0.0: {} - decompress-response@4.2.1: - dependencies: - mimic-response: 2.1.0 - optional: true - dedent@1.5.3: {} deepmerge@4.3.1: {} @@ -7451,9 +7273,6 @@ snapshots: delayed-stream@1.0.0: {} - delegates@1.0.0: - optional: true - denque@2.1.0: {} depd@2.0.0: {} @@ -7545,8 +7364,6 @@ snapshots: entities@4.5.0: {} - enumify@1.0.4: {} - env-paths@2.2.1: {} error-ex@1.3.2: @@ -7661,7 +7478,7 @@ snapshots: extract-zip@2.0.1: dependencies: - debug: 4.3.4 + debug: 4.3.5 get-stream: 5.2.0 yauzl: 2.10.0 optionalDependencies: @@ -7772,11 +7589,6 @@ snapshots: jsonfile: 6.1.0 universalify: 2.0.1 - fs-minipass@2.1.0: - dependencies: - minipass: 3.3.6 - optional: true - fs.realpath@1.0.0: {} fsevents@2.3.3: @@ -7784,19 +7596,6 @@ snapshots: function-bind@1.1.2: {} - gauge@3.0.2: - dependencies: - aproba: 2.0.0 - color-support: 1.1.3 - console-control-strings: 1.1.0 - has-unicode: 2.0.1 - object-assign: 4.1.1 - signal-exit: 3.0.7 - string-width: 4.2.3 - strip-ansi: 6.0.1 - wide-align: 1.1.5 - optional: true - generic-pool@3.9.0: {} gensync@1.0.0-beta.2: {} @@ -7823,7 +7622,7 @@ snapshots: dependencies: basic-ftp: 5.0.5 data-uri-to-buffer: 6.0.2 - debug: 4.3.4 + debug: 4.3.5 fs-extra: 11.2.0 transitivePeerDependencies: - supports-color @@ -7884,9 +7683,6 @@ snapshots: has-symbols@1.0.3: {} - has-unicode@2.0.1: - optional: true - hasown@2.0.2: dependencies: function-bind: 1.1.2 @@ -7927,7 +7723,7 @@ snapshots: http-proxy-agent@7.0.2: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 transitivePeerDependencies: - supports-color @@ -7975,7 +7771,7 @@ snapshots: https-proxy-agent@7.0.5: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 transitivePeerDependencies: - supports-color @@ -8667,11 +8463,6 @@ snapshots: luxon@3.4.4: {} - make-dir@3.1.0: - dependencies: - semver: 6.3.1 - optional: true - make-dir@4.0.0: dependencies: semver: 7.6.2 @@ -8732,9 +8523,6 @@ snapshots: mimic-fn@2.1.0: {} - mimic-response@2.1.0: - optional: true - minimatch@3.1.2: dependencies: brace-expansion: 1.1.11 @@ -8753,22 +8541,8 @@ snapshots: minimist@1.2.8: {} - minipass@3.3.6: - dependencies: - yallist: 4.0.0 - optional: true - - minipass@5.0.0: - optional: true - minipass@7.1.2: {} - minizlib@2.1.2: - dependencies: - minipass: 3.3.6 - yallist: 4.0.0 - optional: true - minizlib@3.0.1: dependencies: minipass: 7.1.2 @@ -8780,9 +8554,6 @@ snapshots: dependencies: minimist: 1.2.8 - mkdirp@1.0.4: - optional: true - mkdirp@3.0.1: {} ml-array-mean@1.1.6: @@ -8875,9 +8646,6 @@ snapshots: mustache@4.2.0: {} - nan@2.22.0: - optional: true - natural-compare@1.4.0: {} natural@7.0.7(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3): @@ -8924,8 +8692,7 @@ snapshots: node-domexception@1.0.0: {} - node-ensure@0.0.0: - optional: true + node-ensure@0.0.0: {} node-fetch@2.7.0: dependencies: @@ -8959,11 +8726,6 @@ snapshots: touch: 3.1.1 undefsafe: 2.0.5 - nopt@5.0.0: - dependencies: - abbrev: 1.1.1 - optional: true - nopt@7.2.1: dependencies: abbrev: 2.0.0 @@ -8976,14 +8738,6 @@ snapshots: dependencies: path-key: 3.1.1 - npmlog@5.0.1: - dependencies: - are-we-there-yet: 2.0.0 - console-control-strings: 1.1.0 - gauge: 3.0.2 - set-blocking: 2.0.0 - optional: true - nth-check@2.1.1: dependencies: boolbase: 1.0.0 @@ -9082,7 +8836,7 @@ snapshots: dependencies: '@tootallnate/quickjs-emscripten': 0.23.0 agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 get-uri: 6.0.3 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.5 @@ -9155,7 +8909,6 @@ snapshots: node-ensure: 0.0.0 transitivePeerDependencies: - supports-color - optional: true peberminta@0.9.0: {} @@ -9278,7 +9031,7 @@ snapshots: proxy-agent@6.4.0: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.5 lru-cache: 7.18.3 @@ -9457,11 +9210,6 @@ snapshots: retry@0.13.1: {} - rimraf@3.0.2: - dependencies: - glob: 7.2.3 - optional: true - rimraf@5.0.7: dependencies: glob: 10.4.2 @@ -9537,9 +9285,6 @@ snapshots: transitivePeerDependencies: - supports-color - set-blocking@2.0.0: - optional: true - set-function-length@1.2.2: dependencies: define-data-property: 1.1.4 @@ -9576,16 +9321,6 @@ snapshots: signal-exit@4.1.0: {} - simple-concat@1.0.1: - optional: true - - simple-get@3.1.1: - dependencies: - decompress-response: 4.2.1 - once: 1.4.0 - simple-concat: 1.0.1 - optional: true - simple-swizzle@0.2.2: dependencies: is-arrayish: 0.3.2 @@ -9603,7 +9338,7 @@ snapshots: socks-proxy-agent@8.0.4: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 socks: 2.8.3 transitivePeerDependencies: - supports-color @@ -9759,16 +9494,6 @@ snapshots: fast-fifo: 1.3.2 streamx: 2.18.0 - tar@6.2.1: - dependencies: - chownr: 2.0.0 - fs-minipass: 2.1.0 - minipass: 5.0.0 - minizlib: 2.1.2 - mkdirp: 1.0.4 - yallist: 4.0.0 - optional: true - tar@7.2.0: dependencies: '@isaacs/fs-minipass': 4.0.1 @@ -9901,13 +9626,6 @@ snapshots: universalify@2.0.1: {} - unpdf@0.12.1: - optionalDependencies: - canvas: 2.11.2 - transitivePeerDependencies: - - encoding - - supports-color - unpipe@1.0.0: {} unstructured-client@0.11.3(zod@3.23.8): @@ -9980,11 +9698,6 @@ snapshots: dependencies: isexe: 2.0.0 - wide-align@1.1.5: - dependencies: - string-width: 4.2.3 - optional: true - winston-transport@4.8.0: dependencies: logform: 2.6.1 diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index c3baa694..0983e4b1 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -6,7 +6,7 @@ import { robustFetch } from "../../lib/fetch"; import { z } from "zod"; import * as Sentry from "@sentry/node"; import escapeHtml from "escape-html"; -import pdf2md from "@opendocsg/pdf2md"; +import PdfParse from "pdf-parse"; import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; import { RemoveFeatureError } from "../../error"; @@ -113,11 +113,10 @@ async function scrapePDFWithParsePDF( meta: Meta, tempFilePath: string, ): Promise { - meta.logger.debug("Processing PDF document with pdf2md", { tempFilePath }); + meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath }); - const pdfBuffer = await fs.readFile(tempFilePath); - const markdown = await pdf2md(pdfBuffer); - const escaped = escapeHtml(markdown); + const result = await PdfParse(await fs.readFile(tempFilePath)); + const escaped = escapeHtml(result.text); return { markdown: escaped, @@ -142,7 +141,7 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom let result: PDFProcessorResult | null = null; - // First, try parsing with pdf2md + // First, try parsing with PdfParse result = await scrapePDFWithParsePDF( { ...meta, @@ -170,14 +169,14 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom result = llamaResult; // Use LlamaParse result if successful } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- using pdf2md result", { + meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( - "LlamaParse failed to parse PDF -- using pdf2md result", + "LlamaParse failed to parse PDF -- using parse-pdf result", { error }, ); Sentry.captureException(error); @@ -194,4 +193,4 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom html: result.html, markdown: result.markdown, }; -} +} \ No newline at end of file diff --git a/apps/api/tsconfig.json b/apps/api/tsconfig.json index 29093be6..ab2a9546 100644 --- a/apps/api/tsconfig.json +++ b/apps/api/tsconfig.json @@ -3,6 +3,7 @@ "rootDir": "./src", "lib": ["ES2022", "DOM"], + // or higher "target": "ES2022", @@ -18,7 +19,7 @@ "*": ["node_modules/*", "src/types/*"], }, - "inlineSources": true + "inlineSources": true, }, "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"] }