Mirror of https://github.com/mendableai/firecrawl.git (synced 2025-06-27 00:41:33 +00:00)

feat(scrape): support Google Docs (FIR-1365) (#1686)
* feat(scrape): support Google Docs
* fixes

Parent: f8983fffb7
Commit: f939428264
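
The gist of the change, for orientation: buildMetaObject (hunk near the end of this diff) detects Google Docs document links, rewrites them to the document's PDF export endpoint, and stores the result in a new optional Meta.rewrittenUrl field that every engine then prefers over meta.url. Below is a minimal standalone sketch of that rewrite, reusing the regex and export URL from the diff; the helper name rewriteGoogleDocsUrl is ours for illustration, not the repo's.

// Sketch only: mirrors the logic added to buildMetaObject further down.
// The function name is hypothetical; the regex and URL format come from the diff.
function rewriteGoogleDocsUrl(url: string): string | undefined {
  if (
    url.startsWith("https://docs.google.com/document/d/") ||
    url.startsWith("http://docs.google.com/document/d/")
  ) {
    // Pull the document ID out of the /document/d/<id> path segment.
    const id = url.match(/\/document\/d\/([-\w]+)/)?.[1];
    if (id) {
      // Google Docs serves a PDF rendition at this export endpoint,
      // which the existing PDF engine already knows how to scrape.
      return `https://docs.google.com/document/d/${id}/export?format=pdf`;
    }
  }
  return undefined; // not a Google Docs document link; leave the URL alone
}

// With the document used in the new test:
// rewriteGoogleDocsUrl("https://docs.google.com/document/d/1H-hOLYssS8xXl2o5hxj4ipE7yyhZAX1s7ADYM1Hdlzo/view")
//   => "https://docs.google.com/document/d/1H-hOLYssS8xXl2o5hxj4ipE7yyhZAX1s7ADYM1Hdlzo/export?format=pdf"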
@@ -745,6 +745,15 @@ describe("Scrape tests", () => {
       // text on the last page
       expect(response.markdown).toContain("Redistribution and use in source and binary forms, with or without modification");
     }, 310000);
+
+    it.concurrent("scrapes Google Docs links as PDFs", async () => {
+      const response = await scrape({
+        url: "https://docs.google.com/document/d/1H-hOLYssS8xXl2o5hxj4ipE7yyhZAX1s7ADYM1Hdlzo/view",
+        timeout: 300000,
+      });
+
+      expect(response.markdown).toContain("This is a test to confirm Google Docs scraping abilities.");
+    }, 310000);
   });
 }

@@ -4,7 +4,7 @@ import { Meta } from "../..";
 import { EngineError, IndexMissError } from "../../error";
 
 export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
-  const key = cacheKey(meta.url, meta.options, meta.internalOptions);
+  const key = cacheKey(meta.rewrittenUrl ?? meta.url, meta.options, meta.internalOptions);
   if (key === null) throw new EngineError("Scrape not eligible for caching");
 
   const entry = await getEntryFromCache(key);
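
Two things worth noting here: the fallback idiom meta.rewrittenUrl ?? meta.url, repeated at every call site below, leaves non-Google-Docs scrapes untouched, and keying the cache on the rewritten URL keeps the exported PDF from colliding with any cache entry for the original document-view URL.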
@@ -4,7 +4,7 @@ import { downloadFile } from "../utils/downloadFile";
 import mammoth from "mammoth";
 
 export async function scrapeDOCX(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
-  const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
+  const { response, tempFilePath } = await downloadFile(meta.id, meta.rewrittenUrl ?? meta.url, {
     headers: meta.options.headers,
     signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeToRun ?? 300000),
   });

@@ -17,7 +17,7 @@ export async function scrapeURLWithFetch(
   const timeout = timeToRun ?? 300000;
 
   const mockOptions = {
-    url: meta.url,
+    url: meta.rewrittenUrl ?? meta.url,
 
     // irrelevant
     method: "GET",
@@ -55,8 +55,8 @@ export async function scrapeURLWithFetch(
   } else {
     try {
       const x = await Promise.race([
-        undici.fetch(meta.url, {
-          dispatcher: await makeSecureDispatcher(meta.url),
+        undici.fetch(meta.rewrittenUrl ?? meta.url, {
+          dispatcher: await makeSecureDispatcher(meta.rewrittenUrl ?? meta.url),
           redirect: "follow",
           headers: meta.options.headers,
           signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeout),

@@ -209,7 +209,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
 
   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestChromeCDP = {
-    url: meta.url,
+    url: meta.rewrittenUrl ?? meta.url,
    engine: "chrome-cdp",
    instantReturn: true,
    skipTlsVerification: meta.options.skipTlsVerification,
@@ -298,7 +298,7 @@ export async function scrapeURLWithFireEnginePlaywright(
 
   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestPlaywright = {
-    url: meta.url,
+    url: meta.rewrittenUrl ?? meta.url,
    engine: "playwright",
    instantReturn: true,

@@ -359,7 +359,7 @@ export async function scrapeURLWithFireEngineTLSClient(
 
   const request: FireEngineScrapeRequestCommon &
     FireEngineScrapeRequestTLSClient = {
-    url: meta.url,
+    url: meta.rewrittenUrl ?? meta.url,
    engine: "tlsclient",
    instantReturn: true,

@@ -87,7 +87,7 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
     url: normalizedURL,
     url_hash: urlHash,
     original_url: document.metadata.sourceURL ?? meta.url,
-    resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
+    resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
     has_screenshot: document.screenshot !== undefined && meta.featureFlags.has("screenshot"),
     has_screenshot_fullscreen: document.screenshot !== undefined && meta.featureFlags.has("screenshot@fullScreen"),
     is_mobile: meta.options.mobile,

@@ -181,14 +181,14 @@ export async function scrapePDF(
       "base64",
     );
     return {
-      url: meta.pdfPrefetch.url ?? meta.url,
+      url: meta.pdfPrefetch.url ?? meta.rewrittenUrl ?? meta.url,
       statusCode: meta.pdfPrefetch.status,
 
       html: content,
       markdown: content,
     };
   } else {
-    const file = await fetchFileToBuffer(meta.url, {
+    const file = await fetchFileToBuffer(meta.rewrittenUrl ?? meta.url, {
       headers: meta.options.headers,
     });
@@ -212,7 +212,7 @@ export async function scrapePDF(
   const { response, tempFilePath } =
     meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null
       ? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
-      : await downloadFile(meta.id, meta.url, {
+      : await downloadFile(meta.id, meta.rewrittenUrl ?? meta.url, {
           headers: meta.options.headers,
         });

@@ -298,7 +298,7 @@ export async function scrapePDF(
   await unlink(tempFilePath);
 
   return {
-    url: response.url ?? meta.url,
+    url: response.url ?? meta.rewrittenUrl ?? meta.url,
     statusCode: response.status,
     html: result?.html ?? "",
     markdown: result?.markdown ?? "",

@@ -18,7 +18,7 @@ export async function scrapeURLWithPlaywright(
       "Content-Type": "application/json",
     },
     body: {
-      url: meta.url,
+      url: meta.rewrittenUrl ?? meta.url,
       wait_after_load: meta.options.waitFor,
       timeout,
       headers: meta.options.headers,
@@ -48,7 +48,7 @@ export async function scrapeURLWithPlaywright(
   }
 
   return {
-    url: meta.url, // TODO: impove redirect following
+    url: meta.rewrittenUrl ?? meta.url, // TODO: impove redirect following
     html: response.content,
     statusCode: response.pageStatusCode,
     error: response.pageError,

@@ -48,6 +48,7 @@ export type ScrapeUrlResponse = (
 export type Meta = {
   id: string;
   url: string;
+  rewrittenUrl?: string;
   options: ScrapeOptions;
   internalOptions: InternalOptions;
   logger: Logger;
@@ -156,9 +157,18 @@ async function buildMetaObject(
   });
   const logs: any[] = [];
 
+  let rewrittenUrl: string | undefined;
+  if (url.startsWith("https://docs.google.com/document/d/") || url.startsWith("http://docs.google.com/document/d/")) {
+    const id = url.match(/\/document\/d\/([-\w]+)/)?.[1];
+    if (id) {
+      rewrittenUrl = `https://docs.google.com/document/d/${id}/export?format=pdf`;
+    }
+  }
+
   return {
     id,
     url,
+    rewrittenUrl,
     options,
     internalOptions,
     logger,
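
Note that the original url stays on Meta next to rewrittenUrl: in the sendDocumentToIndex hunk above, original_url still derives from the unrewritten URL while resolved_url prefers the rewrite, so the stored index record keeps both forms.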
@@ -233,7 +243,7 @@ function safeguardCircularError<T>(error: T): T {
 }
 
 async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
-  meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);
+  meta.logger.info(`Scraping URL ${JSON.stringify(meta.rewrittenUrl ?? meta.url)}...`);
 
   // TODO: handle sitemap data, see WebScraper/index.ts:280
   // TODO: ScrapeEvents
@@ -441,6 +451,11 @@ export async function scrapeURL(
   costTracking: CostTracking,
 ): Promise<ScrapeUrlResponse> {
   const meta = await buildMetaObject(id, url, options, internalOptions, costTracking);
+
+  if (meta.rewrittenUrl) {
+    meta.logger.info("Rewriting URL");
+  }
+
   try {
     while (true) {
       try {

@@ -12,7 +12,7 @@ export async function extractMetadataRust(
   return {
     ...fromRust,
     ...(fromRust.favicon ? {
-      favicon: new URL(fromRust.favicon, meta.url)
+      favicon: new URL(fromRust.favicon, meta.rewrittenUrl ?? meta.url)
     } : {}),
     scrapeId: meta.id,
   };
@@ -75,7 +75,7 @@ export async function extractMetadata(
     soup('link[rel*="icon"]').first().attr("href") ||
     undefined;
   if (faviconLink) {
-    const baseUrl = new URL(meta.url).origin;
+    const baseUrl = new URL(meta.rewrittenUrl ?? meta.url).origin;
     favicon = faviconLink.startsWith("http")
       ? faviconLink
       : `${baseUrl}${faviconLink}`;

@@ -63,12 +63,12 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume
   const res = await supabase_service
     .rpc("diff_get_last_scrape_4", {
       i_team_id: meta.internalOptions.teamId,
-      i_url: document.metadata.sourceURL ?? meta.url,
+      i_url: document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
       i_tag: meta.options.changeTrackingOptions?.tag ?? null,
     });
   const end = Date.now();
   if (end - start > 100) {
-    meta.logger.debug("Diffing took a while", { time: end - start, params: { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.url } });
+    meta.logger.debug("Diffing took a while", { time: end - start, params: { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url } });
   }
 
   const data: {

@@ -48,7 +48,7 @@ export async function deriveHTMLFromRawHTML(
 
   document.html = await htmlTransform(
     document.rawHtml,
-    document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
+    document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url,
     meta.options,
   );
   return document;
@@ -88,7 +88,7 @@ export async function deriveLinksFromHTML(meta: Meta, document: Document): Promi
     );
   }
 
-  document.links = await extractLinks(document.html, meta.url);
+  document.links = await extractLinks(document.html, document.metadata.url ?? document.metadata.sourceURL ?? meta.rewrittenUrl ?? meta.url);
 }
 
 return document;

@@ -684,7 +684,7 @@ export async function performLLMExtract(
   const { extractedDataArray, warning, costLimitExceededTokenUsage } =
     await extractData({
       extractOptions: generationOptions,
-      urls: [meta.url],
+      urls: [meta.rewrittenUrl ?? meta.url],
       useAgent: false,
       scrapeId: meta.id,
     });
@@ -760,7 +760,7 @@ export async function performLLMExtract(
   // // if (shouldUseSmartscrape && smartscrape_prompt) {
   // //   meta.logger.info("Triggering SmartScrape refinement...", { reason: smartscrape_reasoning, prompt: smartscrape_prompt });
   // //   // Call the smartScrape function (which needs to be implemented/imported)
-  // //   // const smartScrapedDocs = await smartScrape(meta.url, smartscrape_prompt);
+  // //   // const smartScrapedDocs = await smartScrape(meta.rewrittenUrl ?? meta.url, smartscrape_prompt);
   // //   // Process/merge smartScrapedDocs with extractedData
   // //   // ... potentially update finalExtract ...
   // // } else {