mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-12-30 00:37:31 +00:00
feat(v1): add public actions api
This commit is contained in:
parent
42d677fe3c
commit
093c064bff
@ -1,7 +1,7 @@
|
||||
import { Request, Response } from "express";
|
||||
import { z } from "zod";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { ExtractorOptions, PageOptions } from "../../lib/entities";
|
||||
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
|
||||
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||
import { PlanType } from "../../types";
|
||||
|
||||
@ -57,6 +57,21 @@ export const extractOptions = z.object({
|
||||
|
||||
/** Static TypeScript type derived from the `extractOptions` zod schema via `z.infer`. */
export type ExtractOptions = z.infer<typeof extractOptions>;
|
||||
|
||||
/** "wait" action: `milliseconds` must be a positive, finite integer. */
const waitActionSchema = z.object({
  type: z.literal("wait"),
  milliseconds: z.number().int().positive().finite(),
});

/** "click" action: `selector` is accepted as any string (no further validation here). */
const clickActionSchema = z.object({
  type: z.literal("click"),
  selector: z.string(),
});

/** "screenshot" action: `fullPage` falls back to `false` when omitted. */
const screenshotActionSchema = z.object({
  type: z.literal("screenshot"),
  fullPage: z.boolean().default(false),
});

/**
 * Validates a list of page actions. Each entry must match one of the
 * "wait", "click" or "screenshot" object shapes, told apart by the
 * literal value of `type`.
 */
export const actionsSchema = z
  .union([waitActionSchema, clickActionSchema, screenshotActionSchema])
  .array();
|
||||
|
||||
export const scrapeOptions = z.object({
|
||||
formats: z
|
||||
.enum([
|
||||
@ -80,6 +95,7 @@ export const scrapeOptions = z.object({
|
||||
waitFor: z.number().int().nonnegative().finite().safe().default(0),
|
||||
extract: extractOptions.optional(),
|
||||
parsePDF: z.boolean().default(true),
|
||||
actions: actionsSchema.optional(),
|
||||
}).strict(strictMessage)
|
||||
|
||||
|
||||
@ -185,6 +201,9 @@ export type Document = {
|
||||
rawHtml?: string;
|
||||
links?: string[];
|
||||
screenshot?: string;
|
||||
actions?: {
|
||||
screenshots: string[];
|
||||
};
|
||||
metadata: {
|
||||
title?: string;
|
||||
description?: string;
|
||||
@ -336,6 +355,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
||||
screenshot: x.formats.includes("screenshot"),
|
||||
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
||||
parsePDF: x.parsePDF,
|
||||
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
||||
};
|
||||
}
|
||||
|
||||
@ -370,6 +390,7 @@ export function legacyDocumentConverter(doc: any): Document {
|
||||
html: doc.html,
|
||||
extract: doc.llm_extraction,
|
||||
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
||||
actions: doc.actions ?? undefined,
|
||||
metadata: {
|
||||
...doc.metadata,
|
||||
pageError: undefined,
|
||||
|
||||
@ -110,6 +110,9 @@ export class Document {
|
||||
childrenLinks?: string[];
|
||||
provider?: string;
|
||||
warning?: string;
|
||||
actions?: {
|
||||
screenshots: string[];
|
||||
}
|
||||
|
||||
index?: number;
|
||||
linksOnPage?: string[]; // Add this new field as a separate property
|
||||
@ -149,7 +152,7 @@ export class SearchResult {
|
||||
|
||||
/**
 * Result of a Fire-Engine scrape.
 *
 * NOTE(review): this appears to be the shape returned by `scrapWithFireEngine`
 * (its return statements build objects with these keys) — confirm against the
 * function's declared return type, which is not visible here.
 *
 * NOTE(review): this span comes from a rendered diff and lists both
 * `screenshot` and `screenshots`; the updated return statements no longer set
 * `screenshot`, so it is presumably the removed line — verify against the
 * real file before relying on it.
 */
export interface FireEngineResponse {
|
||||
// Page HTML; the visible return sites use "" when the request fails or times out.
html: string;
|
||||
// Single-screenshot field; see the review note above about whether it still exists.
screenshot: string;
|
||||
// Optional list of screenshots, passed through from `data.screenshots`;
// consumers default it with `?? []`.
screenshots?: string[];
|
||||
// Status code reported for the page. NOTE(review): some return sites pass
// `null` here, which this type only admits without strict null checks.
pageStatusCode?: number;
|
||||
// Error text for the page; return sites fill it from `pageError ?? error`
// or from the logged error message.
pageError?: string;
|
||||
}
|
||||
|
||||
@ -136,7 +136,7 @@ export async function scrapWithFireEngine({
|
||||
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
|
||||
logParams.error_message = "Request timed out";
|
||||
return { html: "", screenshot: "", pageStatusCode: null, pageError: "" };
|
||||
return { html: "", pageStatusCode: null, pageError: "" };
|
||||
}
|
||||
|
||||
if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
|
||||
@ -155,7 +155,6 @@ export async function scrapWithFireEngine({
|
||||
|
||||
return {
|
||||
html: "",
|
||||
screenshot: "",
|
||||
pageStatusCode,
|
||||
pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
|
||||
};
|
||||
@ -171,7 +170,7 @@ export async function scrapWithFireEngine({
|
||||
logParams.success = true;
|
||||
logParams.response_code = pageStatusCode;
|
||||
logParams.error_message = pageError;
|
||||
return { html: content, screenshot: "", pageStatusCode, pageError };
|
||||
return { html: content, pageStatusCode, pageError };
|
||||
} else {
|
||||
const data = checkStatusResponse.data;
|
||||
|
||||
@ -183,7 +182,7 @@ export async function scrapWithFireEngine({
|
||||
logParams.error_message = data.pageError ?? data.error;
|
||||
return {
|
||||
html: data.content ?? "",
|
||||
screenshot: data.screenshot ?? "",
|
||||
screenshots: data.screenshots,
|
||||
pageStatusCode: data.pageStatusCode,
|
||||
pageError: data.pageError ?? data.error,
|
||||
};
|
||||
@ -196,7 +195,7 @@ export async function scrapWithFireEngine({
|
||||
Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
|
||||
logParams.error_message = error.message || error;
|
||||
}
|
||||
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||
return { html: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||
} finally {
|
||||
const endTime = Date.now();
|
||||
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
||||
|
||||
@ -69,8 +69,13 @@ function getScrapingFallbackOrder(
|
||||
defaultScraper?: string,
|
||||
isWaitPresent: boolean = false,
|
||||
isScreenshotPresent: boolean = false,
|
||||
isHeadersPresent: boolean = false
|
||||
isHeadersPresent: boolean = false,
|
||||
isActionsPresent: boolean = false,
|
||||
) {
|
||||
if (isActionsPresent) {
|
||||
return useFireEngine ? ["fire-engine;chrome-cdp"] : [];
|
||||
}
|
||||
|
||||
const availableScrapers = baseScrapers.filter((scraper) => {
|
||||
switch (scraper) {
|
||||
case "scrapingBee":
|
||||
@ -170,6 +175,9 @@ export async function scrapSingleUrl(
|
||||
let scraperResponse: {
|
||||
text: string;
|
||||
screenshot: string;
|
||||
actions?: {
|
||||
screenshots: string[];
|
||||
};
|
||||
metadata: { pageStatusCode?: number; pageError?: string | null };
|
||||
} = { text: "", screenshot: "", metadata: {} };
|
||||
let screenshot = "";
|
||||
@ -217,7 +225,14 @@ export async function scrapSingleUrl(
|
||||
teamId,
|
||||
});
|
||||
scraperResponse.text = response.html;
|
||||
scraperResponse.screenshot = response.screenshot;
|
||||
if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
|
||||
scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
|
||||
}
|
||||
if (pageOptions.actions) {
|
||||
scraperResponse.actions = {
|
||||
screenshots: response.screenshots ?? [],
|
||||
};
|
||||
}
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
@ -283,9 +298,6 @@ export async function scrapSingleUrl(
|
||||
]) : ([]),
|
||||
pageOptions: customScraperResult.pageOptions,
|
||||
});
|
||||
if (screenshot) {
|
||||
customScrapedContent.screenshot = screenshot;
|
||||
}
|
||||
break;
|
||||
case "pdf":
|
||||
const { content, pageStatusCode, pageError } =
|
||||
@ -295,7 +307,6 @@ export async function scrapSingleUrl(
|
||||
);
|
||||
customScrapedContent = {
|
||||
html: content,
|
||||
screenshot,
|
||||
pageStatusCode,
|
||||
pageError,
|
||||
};
|
||||
@ -305,7 +316,6 @@ export async function scrapSingleUrl(
|
||||
|
||||
if (customScrapedContent) {
|
||||
scraperResponse.text = customScrapedContent.html;
|
||||
screenshot = customScrapedContent.screenshot;
|
||||
}
|
||||
//* TODO: add an option to return markdown or structured/extracted content
|
||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
||||
@ -325,16 +335,18 @@ export async function scrapSingleUrl(
|
||||
html: cleanedHtml,
|
||||
rawHtml: scraperResponse.text,
|
||||
screenshot: scraperResponse.screenshot,
|
||||
actions: scraperResponse.actions,
|
||||
pageStatusCode: scraperResponse.metadata.pageStatusCode,
|
||||
pageError: scraperResponse.metadata.pageError || undefined,
|
||||
};
|
||||
};
|
||||
|
||||
let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
|
||||
let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = {
|
||||
text: "",
|
||||
html: "",
|
||||
rawHtml: "",
|
||||
screenshot: "",
|
||||
actions: undefined,
|
||||
pageStatusCode: 200,
|
||||
pageError: undefined,
|
||||
};
|
||||
@ -350,7 +362,8 @@ export async function scrapSingleUrl(
|
||||
defaultScraper,
|
||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
|
||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
|
||||
pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
|
||||
);
|
||||
|
||||
for (const scraper of scrapersInOrder) {
|
||||
@ -367,6 +380,7 @@ export async function scrapSingleUrl(
|
||||
html = attempt.html ?? "";
|
||||
rawHtml = attempt.rawHtml ?? "";
|
||||
screenshot = attempt.screenshot ?? "";
|
||||
actions = attempt.actions ?? undefined;
|
||||
|
||||
if (attempt.pageStatusCode) {
|
||||
pageStatusCode = attempt.pageStatusCode;
|
||||
@ -404,45 +418,27 @@ export async function scrapSingleUrl(
|
||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||
}
|
||||
|
||||
let document: Document;
|
||||
if (screenshot && screenshot.length > 0) {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||
? rawHtml
|
||||
: undefined,
|
||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||
metadata: {
|
||||
...metadata,
|
||||
screenshot: screenshot,
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError,
|
||||
},
|
||||
};
|
||||
} else {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||
? rawHtml
|
||||
: undefined,
|
||||
metadata: {
|
||||
...metadata,
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError,
|
||||
},
|
||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||
};
|
||||
}
|
||||
let document: Document = {
|
||||
content: text,
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||
? rawHtml
|
||||
: undefined,
|
||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||
actions,
|
||||
metadata: {
|
||||
...metadata,
|
||||
...(screenshot && screenshot.length > 0 ? ({
|
||||
screenshot,
|
||||
}) : {}),
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError,
|
||||
},
|
||||
};
|
||||
|
||||
return document;
|
||||
} catch (error) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user