diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index 6361f988a..1336a87bd 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -745,6 +745,15 @@ describe("Scrape tests", () => {
       // text on the last page
       expect(response.markdown).toContain("Redistribution and use in source and binary forms, with or without modification");
     }, 310000);
+
+    it.concurrent("scrapes Google Docs links as PDFs", async () => {
+      const response = await scrape({
+        url: "https://docs.google.com/document/d/1H-hOLYssS8xXl2o5hxj4ipE7yyhZAX1s7ADYM1Hdlzo/view",
+        timeout: 300000,
+      });
+
+      expect(response.markdown).toContain("This is a test to confirm Google Docs scraping abilities.");
+    }, 310000);
   });
 }
diff --git a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts
index cdff5134c..49acbcd7b 100644
--- a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts
@@ -4,7 +4,7 @@
 import { Meta } from "../..";
 import { EngineError, IndexMissError } from "../../error";
 
 export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
-  const key = cacheKey(meta.url, meta.options, meta.internalOptions);
+  const key = cacheKey(meta.rewrittenUrl ?? meta.url, meta.options, meta.internalOptions);
   if (key === null) throw new EngineError("Scrape not eligible for caching");
 
   const entry = await getEntryFromCache(key);
diff --git a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts
index 04b30a0a8..416e77fc4 100644
--- a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts
@@ -4,7 +4,7 @@
 import { downloadFile } from "../utils/downloadFile";
 import mammoth from "mammoth";
 
 export async function scrapeDOCX(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
-  const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
+  const { response, tempFilePath } = await downloadFile(meta.id, meta.rewrittenUrl ?? meta.url, {
     headers: meta.options.headers,
     signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeToRun ?? 300000),
   });
diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
index 299b0d355..c4e487b84 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
@@ -17,7 +17,7 @@ export async function scrapeURLWithFetch(
   const timeout = timeToRun ?? 300000;
 
   const mockOptions = {
-    url: meta.url,
+    url: meta.rewrittenUrl ?? meta.url,
 
     // irrelevant
     method: "GET",
@@ -55,8 +55,8 @@ export async function scrapeURLWithFetch(
   } else {
     try {
       const x = await Promise.race([
-        undici.fetch(meta.url, {
-          dispatcher: await makeSecureDispatcher(meta.url),
+        undici.fetch(meta.rewrittenUrl ?? meta.url, {
+          dispatcher: await makeSecureDispatcher(meta.rewrittenUrl ?? meta.url),
           redirect: "follow",
           headers: meta.options.headers,
           signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeout),
diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
index 3e8490c40..84884ec90 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
@@ -209,7 +209,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestChromeCDP = {
-      url: meta.url,
+      url: meta.rewrittenUrl ?? meta.url,
       engine: "chrome-cdp",
       instantReturn: true,
       skipTlsVerification: meta.options.skipTlsVerification,
@@ -298,7 +298,7 @@ export async function scrapeURLWithFireEnginePlaywright(
   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestPlaywright = {
-      url: meta.url,
+      url: meta.rewrittenUrl ?? meta.url,
       engine: "playwright",
       instantReturn: true,
@@ -359,7 +359,7 @@ export async function scrapeURLWithFireEngineTLSClient(
   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestTLSClient = {
-      url: meta.url,
+      url: meta.rewrittenUrl ?? meta.url,
       engine: "tlsclient",
       instantReturn: true,
diff --git a/apps/api/src/scraper/scrapeURL/engines/index/index.ts b/apps/api/src/scraper/scrapeURL/engines/index/index.ts
index fe2043fe0..bf6d1fdb0 100644
--- a/apps/api/src/scraper/scrapeURL/engines/index/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/index/index.ts
@@ -87,7 +87,7 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
       url: normalizedURL,
       url_hash: urlHash,
       original_url: document.metadata.sourceURL ?? meta.url,
-      resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
+      resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
       has_screenshot: document.screenshot !== undefined && meta.featureFlags.has("screenshot"),
       has_screenshot_fullscreen: document.screenshot !== undefined && meta.featureFlags.has("screenshot@fullScreen"),
       is_mobile: meta.options.mobile,
diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
index 01ef4db53..3dd97995e 100644
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -181,14 +181,14 @@ export async function scrapePDF(
         "base64",
       );
       return {
-        url: meta.pdfPrefetch.url ?? meta.url,
+        url: meta.pdfPrefetch.url ?? meta.rewrittenUrl ?? meta.url,
         statusCode: meta.pdfPrefetch.status,
 
         html: content,
         markdown: content,
       };
     } else {
-      const file = await fetchFileToBuffer(meta.url, {
+      const file = await fetchFileToBuffer(meta.rewrittenUrl ?? meta.url, {
         headers: meta.options.headers,
       });
@@ -212,7 +212,7 @@ export async function scrapePDF(
   const { response, tempFilePath } =
     meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null
       ? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
-      : await downloadFile(meta.id, meta.url, {
+      : await downloadFile(meta.id, meta.rewrittenUrl ?? meta.url, {
           headers: meta.options.headers,
         });
@@ -298,7 +298,7 @@ export async function scrapePDF(
     await unlink(tempFilePath);
 
     return {
-      url: response.url ?? meta.url,
+      url: response.url ?? meta.rewrittenUrl ?? meta.url,
       statusCode: response.status,
 
       html: result?.html ?? "",
       markdown: result?.markdown ?? "",
diff --git a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
index 57ae8f6b1..13f47fdfd 100644
--- a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
@@ -18,7 +18,7 @@ export async function scrapeURLWithPlaywright(
         "Content-Type": "application/json",
       },
       body: {
-        url: meta.url,
+        url: meta.rewrittenUrl ?? meta.url,
         wait_after_load: meta.options.waitFor,
         timeout,
         headers: meta.options.headers,
@@ -48,7 +48,7 @@ export async function scrapeURLWithPlaywright(
   }
 
   return {
-    url: meta.url, // TODO: impove redirect following
+    url: meta.rewrittenUrl ?? meta.url, // TODO: improve redirect following
     html: response.content,
     statusCode: response.pageStatusCode,
     error: response.pageError,
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index 7fc043b70..73f9ebb2c 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -48,6 +48,7 @@ export type ScrapeUrlResponse = (
 export type Meta = {
   id: string;
   url: string;
+  rewrittenUrl?: string;
   options: ScrapeOptions;
   internalOptions: InternalOptions;
   logger: Logger;
@@ -156,9 +157,18 @@ async function buildMetaObject(
   });
 
   const logs: any[] = [];
 
+  let rewrittenUrl: string | undefined;
+  if (url.startsWith("https://docs.google.com/document/d/") || url.startsWith("http://docs.google.com/document/d/")) {
+    const id = url.match(/\/document\/d\/([-\w]+)/)?.[1];
+    if (id) {
+      rewrittenUrl = `https://docs.google.com/document/d/${id}/export?format=pdf`;
+    }
+  }
+
   return {
     id,
     url,
+    rewrittenUrl,
     options,
     internalOptions,
     logger,
@@ -233,7 +243,7 @@ function safeguardCircularError<T>(error: T): T {
 }
 
 async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
-  meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);
+  meta.logger.info(`Scraping URL ${JSON.stringify(meta.rewrittenUrl ?? meta.url)}...`);
 
   // TODO: handle sitemap data, see WebScraper/index.ts:280
   // TODO: ScrapeEvents
@@ -441,6 +451,11 @@ export async function scrapeURL(
   costTracking: CostTracking,
 ): Promise<ScrapeUrlResponse> {
   const meta = await buildMetaObject(id, url, options, internalOptions, costTracking);
+
+  if (meta.rewrittenUrl) {
+    meta.logger.info("Rewriting URL");
+  }
+
   try {
     while (true) {
       try {
diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
index 20a6dd7ba..fee2b45d5 100644
--- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
@@ -12,7 +12,7 @@ export async function extractMetadataRust(
   return {
     ...fromRust,
     ...(fromRust.favicon ? {
-      favicon: new URL(fromRust.favicon, meta.url)
+      favicon: new URL(fromRust.favicon, meta.rewrittenUrl ?? meta.url)
     } : {}),
     scrapeId: meta.id,
   };
@@ -75,7 +75,7 @@ export async function extractMetadata(
     soup('link[rel*="icon"]').first().attr("href") ||
     undefined;
 
   if (faviconLink) {
-    const baseUrl = new URL(meta.url).origin;
+    const baseUrl = new URL(meta.rewrittenUrl ?? meta.url).origin;
     favicon = faviconLink.startsWith("http")
       ? faviconLink
       : `${baseUrl}${faviconLink}`;
diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts
index 6284d9a6f..1d313ce94 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/diff.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts
@@ -63,12 +63,12 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
   if (end - start > 100) {
-    meta.logger.debug("Diffing took a while", { time: end - start, params: { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.url } });
+    meta.logger.debug("Diffing took a while", { time: end - start, params: { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url } });
   }
 
   const data: {
diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts
index d91a9c36d..f229793b3 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/index.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts
@@ -48,7 +48,7 @@ export async function deriveHTMLFromRawHTML(
 
   document.html = await htmlTransform(
     document.rawHtml,
-    document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
+    document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
     meta.options,
   );
   return document;
@@ -88,7 +88,7 @@ export async function deriveLinksFromHTML(meta: Meta, document: Document): Promise<Document> {
       );
     }
 
-    document.links = await extractLinks(document.html, meta.url);
+    document.links = await extractLinks(document.html, document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url);
   }
 
   return document;
diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
index 53fe0ea71..4ca536965 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@@ -684,7 +684,7 @@ export async function performLLMExtract(
   const { extractedDataArray, warning, costLimitExceededTokenUsage } =
     await extractData({
       extractOptions: generationOptions,
-      urls: [meta.url],
+      urls: [meta.rewrittenUrl ?? meta.url],
       useAgent: false,
       scrapeId: meta.id,
     });
@@ -760,7 +760,7 @@ export async function performLLMExtract(
 // //       if (shouldUseSmartscrape && smartscrape_prompt) {
 // //         meta.logger.info("Triggering SmartScrape refinement...", { reason: smartscrape_reasoning, prompt: smartscrape_prompt });
 // //         // Call the smartScrape function (which needs to be implemented/imported)
-// //         // const smartScrapedDocs = await smartScrape(meta.url, smartscrape_prompt);
+// //         // const smartScrapedDocs = await smartScrape(meta.rewrittenUrl ?? meta.url, smartscrape_prompt);
 // //         // Process/merge smartScrapedDocs with extractedData
 // //         // ... potentially update finalExtract ...
 // //       } else {
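
For reference, the rewrite rule introduced in buildMetaObject above can be read in
isolation. Below is a minimal standalone sketch of the same logic; the helper name
rewriteGoogleDocsUrl is hypothetical and used here only for illustration, since the
patch inlines this logic directly in buildMetaObject:

    // Rewrites a Google Docs document link to its PDF export endpoint, so the
    // PDF engine can scrape it; returns undefined for any other URL.
    function rewriteGoogleDocsUrl(url: string): string | undefined {
      if (
        url.startsWith("https://docs.google.com/document/d/") ||
        url.startsWith("http://docs.google.com/document/d/")
      ) {
        const id = url.match(/\/document\/d\/([-\w]+)/)?.[1];
        if (id) {
          return `https://docs.google.com/document/d/${id}/export?format=pdf`;
        }
      }
      return undefined;
    }

    // Example: the test document URL above rewrites to
    // https://docs.google.com/document/d/1H-hOLYssS8xXl2o5hxj4ipE7yyhZAX1s7ADYM1Hdlzo/export?format=pdf

Engines consume the result through the meta.rewrittenUrl ?? meta.url fallback, so
behavior is unchanged for URLs that are not rewritten.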