feat(scrape): support Google Docs (FIR-1365) (#1686)

* feat(scrape): support Google Docs

* fixes
Gergő Móricz 2025-06-20 11:42:41 +02:00 committed by GitHub
parent f8983fffb7
commit f939428264
13 changed files with 48 additions and 24 deletions
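
In short: Google Docs document links are rewritten to the document's PDF export endpoint and stored on a new optional Meta.rewrittenUrl field; the engines, the cache key, and the transformers then prefer meta.rewrittenUrl ?? meta.url, so Docs links flow through the existing PDF scraping path while every other URL behaves as before. A minimal TypeScript sketch of the rewrite (the standalone helper name rewriteGoogleDocsUrl is illustrative only; in the diff the logic is inlined in buildMetaObject below):

// Sketch, not part of the diff: mirrors the rewrite added to buildMetaObject below.
function rewriteGoogleDocsUrl(url: string): string | undefined {
  if (
    url.startsWith("https://docs.google.com/document/d/") ||
    url.startsWith("http://docs.google.com/document/d/")
  ) {
    const id = url.match(/\/document\/d\/([-\w]+)/)?.[1];
    if (id) {
      // Google Docs exposes a PDF export endpoint for every document ID.
      return `https://docs.google.com/document/d/${id}/export?format=pdf`;
    }
  }
  return undefined; // not a Docs document link: meta.rewrittenUrl stays unset
}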

View File

@@ -745,6 +745,15 @@ describe("Scrape tests", () => {
// text on the last page
expect(response.markdown).toContain("Redistribution and use in source and binary forms, with or without modification");
}, 310000);
+it.concurrent("scrapes Google Docs links as PDFs", async () => {
+const response = await scrape({
+url: "https://docs.google.com/document/d/1H-hOLYssS8xXl2o5hxj4ipE7yyhZAX1s7ADYM1Hdlzo/view",
+timeout: 300000,
+});
+expect(response.markdown).toContain("This is a test to confirm Google Docs scraping abilities.");
+}, 310000);
});
}

View File

@@ -4,7 +4,7 @@ import { Meta } from "../..";
import { EngineError, IndexMissError } from "../../error";
export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
-const key = cacheKey(meta.url, meta.options, meta.internalOptions);
+const key = cacheKey(meta.rewrittenUrl ?? meta.url, meta.options, meta.internalOptions);
if (key === null) throw new EngineError("Scrape not eligible for caching");
const entry = await getEntryFromCache(key);

View File

@@ -4,7 +4,7 @@ import { downloadFile } from "../utils/downloadFile";
import mammoth from "mammoth";
export async function scrapeDOCX(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
-const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
+const { response, tempFilePath } = await downloadFile(meta.id, meta.rewrittenUrl ?? meta.url, {
headers: meta.options.headers,
signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeToRun ?? 300000),
});

View File

@@ -17,7 +17,7 @@ export async function scrapeURLWithFetch(
const timeout = timeToRun ?? 300000;
const mockOptions = {
-url: meta.url,
+url: meta.rewrittenUrl ?? meta.url,
// irrelevant
method: "GET",
@@ -55,8 +55,8 @@ export async function scrapeURLWithFetch(
} else {
try {
const x = await Promise.race([
-undici.fetch(meta.url, {
-dispatcher: await makeSecureDispatcher(meta.url),
+undici.fetch(meta.rewrittenUrl ?? meta.url, {
+dispatcher: await makeSecureDispatcher(meta.rewrittenUrl ?? meta.url),
redirect: "follow",
headers: meta.options.headers,
signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeout),

View File

@@ -209,7 +209,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestChromeCDP = {
-url: meta.url,
+url: meta.rewrittenUrl ?? meta.url,
engine: "chrome-cdp",
instantReturn: true,
skipTlsVerification: meta.options.skipTlsVerification,
@@ -298,7 +298,7 @@ export async function scrapeURLWithFireEnginePlaywright(
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = {
-url: meta.url,
+url: meta.rewrittenUrl ?? meta.url,
engine: "playwright",
instantReturn: true,
@@ -359,7 +359,7 @@ export async function scrapeURLWithFireEngineTLSClient(
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestTLSClient = {
-url: meta.url,
+url: meta.rewrittenUrl ?? meta.url,
engine: "tlsclient",
instantReturn: true,

View File

@@ -87,7 +87,7 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
url: normalizedURL,
url_hash: urlHash,
original_url: document.metadata.sourceURL ?? meta.url,
-resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
+resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
has_screenshot: document.screenshot !== undefined && meta.featureFlags.has("screenshot"),
has_screenshot_fullscreen: document.screenshot !== undefined && meta.featureFlags.has("screenshot@fullScreen"),
is_mobile: meta.options.mobile,

View File

@@ -181,14 +181,14 @@ export async function scrapePDF(
"base64",
);
return {
-url: meta.pdfPrefetch.url ?? meta.url,
+url: meta.pdfPrefetch.url ?? meta.rewrittenUrl ?? meta.url,
statusCode: meta.pdfPrefetch.status,
html: content,
markdown: content,
};
} else {
-const file = await fetchFileToBuffer(meta.url, {
+const file = await fetchFileToBuffer(meta.rewrittenUrl ?? meta.url, {
headers: meta.options.headers,
});
@@ -212,7 +212,7 @@ export async function scrapePDF(
const { response, tempFilePath } =
meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null
? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
-: await downloadFile(meta.id, meta.url, {
+: await downloadFile(meta.id, meta.rewrittenUrl ?? meta.url, {
headers: meta.options.headers,
});
@@ -298,7 +298,7 @@ export async function scrapePDF(
await unlink(tempFilePath);
return {
-url: response.url ?? meta.url,
+url: response.url ?? meta.rewrittenUrl ?? meta.url,
statusCode: response.status,
html: result?.html ?? "",
markdown: result?.markdown ?? "",

View File

@@ -18,7 +18,7 @@ export async function scrapeURLWithPlaywright(
"Content-Type": "application/json",
},
body: {
-url: meta.url,
+url: meta.rewrittenUrl ?? meta.url,
wait_after_load: meta.options.waitFor,
timeout,
headers: meta.options.headers,
@@ -48,7 +48,7 @@ export async function scrapeURLWithPlaywright(
}
return {
-url: meta.url, // TODO: impove redirect following
+url: meta.rewrittenUrl ?? meta.url, // TODO: impove redirect following
html: response.content,
statusCode: response.pageStatusCode,
error: response.pageError,

View File

@@ -48,6 +48,7 @@ export type ScrapeUrlResponse = (
export type Meta = {
id: string;
url: string;
+rewrittenUrl?: string;
options: ScrapeOptions;
internalOptions: InternalOptions;
logger: Logger;
@@ -156,9 +157,18 @@ async function buildMetaObject(
});
const logs: any[] = [];
+let rewrittenUrl: string | undefined;
+if (url.startsWith("https://docs.google.com/document/d/") || url.startsWith("http://docs.google.com/document/d/")) {
+const id = url.match(/\/document\/d\/([-\w]+)/)?.[1];
+if (id) {
+rewrittenUrl = `https://docs.google.com/document/d/${id}/export?format=pdf`;
+}
+}
return {
id,
url,
+rewrittenUrl,
options,
internalOptions,
logger,
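
For illustration only (not part of the diff), applying the rewriteGoogleDocsUrl sketch from the top of this page to the document used in the new test gives:

const rewritten = rewriteGoogleDocsUrl(
  "https://docs.google.com/document/d/1H-hOLYssS8xXl2o5hxj4ipE7yyhZAX1s7ADYM1Hdlzo/view",
);
// rewritten === "https://docs.google.com/document/d/1H-hOLYssS8xXl2o5hxj4ipE7yyhZAX1s7ADYM1Hdlzo/export?format=pdf"
// A non-Docs URL would return undefined, leaving meta.rewrittenUrl unset and the scrape unchanged.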
@@ -233,7 +243,7 @@ function safeguardCircularError<T>(error: T): T {
}
async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
-meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);
+meta.logger.info(`Scraping URL ${JSON.stringify(meta.rewrittenUrl ?? meta.url)}...`);
// TODO: handle sitemap data, see WebScraper/index.ts:280
// TODO: ScrapeEvents
@@ -441,6 +451,11 @@ export async function scrapeURL(
costTracking: CostTracking,
): Promise<ScrapeUrlResponse> {
const meta = await buildMetaObject(id, url, options, internalOptions, costTracking);
+if (meta.rewrittenUrl) {
+meta.logger.info("Rewriting URL");
+}
try {
while (true) {
try {

View File

@@ -12,7 +12,7 @@ export async function extractMetadataRust(
return {
...fromRust,
...(fromRust.favicon ? {
-favicon: new URL(fromRust.favicon, meta.url)
+favicon: new URL(fromRust.favicon, meta.rewrittenUrl ?? meta.url)
} : {}),
scrapeId: meta.id,
};
@@ -75,7 +75,7 @@ export async function extractMetadata(
soup('link[rel*="icon"]').first().attr("href") ||
undefined;
if (faviconLink) {
-const baseUrl = new URL(meta.url).origin;
+const baseUrl = new URL(meta.rewrittenUrl ?? meta.url).origin;
favicon = faviconLink.startsWith("http")
? faviconLink
: `${baseUrl}${faviconLink}`;

View File

@@ -63,12 +63,12 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
const res = await supabase_service
.rpc("diff_get_last_scrape_4", {
i_team_id: meta.internalOptions.teamId,
-i_url: document.metadata.sourceURL ?? meta.url,
+i_url: document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
i_tag: meta.options.changeTrackingOptions?.tag ?? null,
});
const end = Date.now();
if (end - start > 100) {
meta.logger.debug("Diffing took a while", { time: end - start, params: { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.url } });
meta.logger.debug("Diffing took a while", { time: end - start, params: { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url } });
}
const data: {

View File

@@ -48,7 +48,7 @@ export async function deriveHTMLFromRawHTML(
document.html = await htmlTransform(
document.rawHtml,
-document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
+document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
meta.options,
);
return document;
@@ -88,7 +88,7 @@ export async function deriveLinksFromHTML(meta: Meta, document: Document): Promise<Document> {
);
}
-document.links = await extractLinks(document.html, meta.url);
+document.links = await extractLinks(document.html, document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url);
}
return document;

View File

@@ -684,7 +684,7 @@ export async function performLLMExtract(
const { extractedDataArray, warning, costLimitExceededTokenUsage } =
await extractData({
extractOptions: generationOptions,
-urls: [meta.url],
+urls: [meta.rewrittenUrl ?? meta.url],
useAgent: false,
scrapeId: meta.id,
});
@@ -760,7 +760,7 @@ export async function performLLMExtract(
// // if (shouldUseSmartscrape && smartscrape_prompt) {
// // meta.logger.info("Triggering SmartScrape refinement...", { reason: smartscrape_reasoning, prompt: smartscrape_prompt });
// // // Call the smartScrape function (which needs to be implemented/imported)
-// // // const smartScrapedDocs = await smartScrape(meta.url, smartscrape_prompt);
+// // // const smartScrapedDocs = await smartScrape(meta.rewrittenUrl ?? meta.url, smartscrape_prompt);
// // // Process/merge smartScrapedDocs with extractedData
// // // ... potentially update finalExtract ...
// // } else {