feat(scrape): support Google Docs

Gergő Móricz · 2025-06-20 11:12:05 +02:00
parent f8983fffb7 · commit 3cf22f9167
8 changed files with 36 additions and 12 deletions

View File

@@ -745,6 +745,15 @@ describe("Scrape tests", () => {
 
       // text on the last page
       expect(response.markdown).toContain("Redistribution and use in source and binary forms, with or without modification");
     }, 310000);
+
+    it.concurrent("scrapes Google Docs links as PDFs", async () => {
+      const response = await scrape({
+        url: "https://docs.google.com/document/d/1H-hOLYssS8xXl2o5hxj4ipE7yyhZAX1s7ADYM1Hdlzo/view",
+        timeout: 300000,
+      });
+
+      expect(response.markdown).toContain("This is a test to confirm Google Docs scraping abilities.");
+    }, 310000);
   });
 }
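Note: the test passes the document's /view link; per the buildMetaObject change further down, that URL is rewritten to the PDF export endpoint before any engine runs, which is why the assertion checks markdown extracted from a PDF. Assuming the rewrite behaves as implemented below, the transformation for this test's URL is:

// What the test sends:
const input =
  "https://docs.google.com/document/d/1H-hOLYssS8xXl2o5hxj4ipE7yyhZAX1s7ADYM1Hdlzo/view";
// What the engines actually fetch after the rewrite:
const rewritten =
  "https://docs.google.com/document/d/1H-hOLYssS8xXl2o5hxj4ipE7yyhZAX1s7ADYM1Hdlzo/export?format=pdf";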

View File

@@ -4,7 +4,7 @@ import { Meta } from "../..";
 import { EngineError, IndexMissError } from "../../error";
 
 export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
-  const key = cacheKey(meta.url, meta.options, meta.internalOptions);
+  const key = cacheKey(meta.rewrittenUrl ?? meta.url, meta.options, meta.internalOptions);
   if (key === null) throw new EngineError("Scrape not eligible for caching");
 
   const entry = await getEntryFromCache(key);
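The `meta.rewrittenUrl ?? meta.url` fallback introduced here recurs in every engine below: when a rewritten URL was computed while building the meta object, it takes precedence; otherwise the original request URL is used. A minimal standalone sketch of the pattern (`MetaUrls` and `effectiveUrl` are illustrative names, not part of this commit):

// Trimmed to the two fields the fallback needs.
type MetaUrls = { url: string; rewrittenUrl?: string };

// `??` returns the left operand unless it is null or undefined,
// so the rewritten URL wins whenever it was set.
function effectiveUrl(meta: MetaUrls): string {
  return meta.rewrittenUrl ?? meta.url;
}

// effectiveUrl({ url: "https://example.com" }) -> "https://example.com"
// effectiveUrl({ url: "https://example.com", rewrittenUrl: "https://example.com/a.pdf" })
//   -> "https://example.com/a.pdf"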

View File

@@ -4,7 +4,7 @@ import { downloadFile } from "../utils/downloadFile";
 import mammoth from "mammoth";
 
 export async function scrapeDOCX(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
-  const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
+  const { response, tempFilePath } = await downloadFile(meta.id, meta.rewrittenUrl ?? meta.url, {
     headers: meta.options.headers,
     signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeToRun ?? 300000),
   });

View File

@@ -17,7 +17,7 @@ export async function scrapeURLWithFetch(
   const timeout = timeToRun ?? 300000;
 
   const mockOptions = {
-    url: meta.url,
+    url: meta.rewrittenUrl ?? meta.url,
     // irrelevant
     method: "GET",

View File

@@ -209,7 +209,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestChromeCDP = {
-    url: meta.url,
+    url: meta.rewrittenUrl ?? meta.url,
     engine: "chrome-cdp",
     instantReturn: true,
     skipTlsVerification: meta.options.skipTlsVerification,
@@ -298,7 +298,7 @@ export async function scrapeURLWithFireEnginePlaywright(
   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestPlaywright = {
-    url: meta.url,
+    url: meta.rewrittenUrl ?? meta.url,
     engine: "playwright",
     instantReturn: true,
@@ -359,7 +359,7 @@ export async function scrapeURLWithFireEngineTLSClient(
   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestTLSClient = {
-    url: meta.url,
+    url: meta.rewrittenUrl ?? meta.url,
     engine: "tlsclient",
     instantReturn: true,

View File

@@ -181,14 +181,14 @@ export async function scrapePDF(
       "base64",
     );
     return {
-      url: meta.pdfPrefetch.url ?? meta.url,
+      url: meta.pdfPrefetch.url ?? meta.rewrittenUrl ?? meta.url,
       statusCode: meta.pdfPrefetch.status,
       html: content,
       markdown: content,
     };
   } else {
-    const file = await fetchFileToBuffer(meta.url, {
+    const file = await fetchFileToBuffer(meta.rewrittenUrl ?? meta.url, {
       headers: meta.options.headers,
     });
@@ -212,7 +212,7 @@ export async function scrapePDF(
   const { response, tempFilePath } =
     meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null
       ? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
-      : await downloadFile(meta.id, meta.url, {
+      : await downloadFile(meta.id, meta.rewrittenUrl ?? meta.url, {
           headers: meta.options.headers,
         });
@@ -298,7 +298,7 @@ export async function scrapePDF(
   await unlink(tempFilePath);
   return {
-    url: response.url ?? meta.url,
+    url: response.url ?? meta.rewrittenUrl ?? meta.url,
     statusCode: response.status,
     html: result?.html ?? "",
     markdown: result?.markdown ?? "",
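The PDF engine uses a longer chain: the URL reported by the prefetch or download response (presumably reflecting any redirects) beats the rewritten URL, which beats the original. Since `??` short-circuits left to right, `a ?? b ?? c` yields the first non-nullish operand. A hedged sketch of the precedence, with illustrative names:

// Precedence used by scrapePDF's return values:
// response/prefetch URL > rewrittenUrl > original url.
function resolvePdfUrl(
  responseUrl: string | undefined,
  rewrittenUrl: string | undefined,
  url: string,
): string {
  return responseUrl ?? rewrittenUrl ?? url;
}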

View File

@@ -18,7 +18,7 @@ export async function scrapeURLWithPlaywright(
       "Content-Type": "application/json",
     },
     body: {
-      url: meta.url,
+      url: meta.rewrittenUrl ?? meta.url,
       wait_after_load: meta.options.waitFor,
       timeout,
       headers: meta.options.headers,
@@ -48,7 +48,7 @@ export async function scrapeURLWithPlaywright(
   }
   return {
-    url: meta.url, // TODO: improve redirect following
+    url: meta.rewrittenUrl ?? meta.url, // TODO: improve redirect following
     html: response.content,
     statusCode: response.pageStatusCode,
     error: response.pageError,

View File

@@ -48,6 +48,7 @@ export type ScrapeUrlResponse = (
 export type Meta = {
   id: string;
   url: string;
+  rewrittenUrl?: string;
   options: ScrapeOptions;
   internalOptions: InternalOptions;
   logger: Logger;
@@ -156,9 +157,18 @@ async function buildMetaObject(
   });
 
   const logs: any[] = [];
+
+  let rewrittenUrl: string | undefined;
+  if (url.startsWith("https://docs.google.com/document/d/") || url.startsWith("http://docs.google.com/document/d/")) {
+    const id = url.match(/\/document\/d\/([-\w]+)/)?.[1];
+    if (id) {
+      rewrittenUrl = `https://docs.google.com/document/d/${id}/export?format=pdf`;
+    }
+  }
   return {
     id,
     url,
+    rewrittenUrl,
     options,
     internalOptions,
     logger,
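The rewrite itself is pure string manipulation, so it is easy to exercise in isolation. A minimal sketch mirroring the logic added to buildMetaObject above (the function name rewriteGoogleDocsUrl is illustrative, not from the commit):

// Google Docs document links are rewritten to their PDF export form,
// so the existing PDF engine can handle them unchanged.
function rewriteGoogleDocsUrl(url: string): string | undefined {
  if (
    url.startsWith("https://docs.google.com/document/d/") ||
    url.startsWith("http://docs.google.com/document/d/")
  ) {
    // Document IDs are letters, digits, dashes, and underscores ([-\w]+).
    const id = url.match(/\/document\/d\/([-\w]+)/)?.[1];
    if (id) return `https://docs.google.com/document/d/${id}/export?format=pdf`;
  }
  return undefined; // not a Docs document link: no rewrite
}

// rewriteGoogleDocsUrl("https://docs.google.com/document/d/abc-123/view")
//   -> "https://docs.google.com/document/d/abc-123/export?format=pdf"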
@@ -441,6 +451,11 @@ export async function scrapeURL(
   costTracking: CostTracking,
 ): Promise<ScrapeUrlResponse> {
   const meta = await buildMetaObject(id, url, options, internalOptions, costTracking);
+
+  if (meta.rewrittenUrl) {
+    meta.logger.info("Rewriting URL");
+  }
+
   try {
     while (true) {
       try {