From 446acfccde8c7304b99fb75d129d789308b0a13e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 31 Oct 2024 20:01:52 -0300 Subject: [PATCH 1/4] Nick: support for the new actions --- apps/api/src/controllers/v1/types.ts | 4 ++++ apps/api/src/lib/entities.ts | 13 +++++++++++-- .../src/scraper/WebScraper/scrapers/fireEngine.ts | 2 ++ apps/api/src/scraper/WebScraper/single_url.ts | 5 ++++- 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 633bbdf16..229893d1c 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -62,6 +62,7 @@ export const actionsSchema = z.array(z.union([ z.object({ type: z.literal("wait"), milliseconds: z.number().int().positive().finite(), + selector: z.string().optional(), }), z.object({ type: z.literal("click"), @@ -83,6 +84,9 @@ export const actionsSchema = z.array(z.union([ type: z.literal("scroll"), direction: z.enum(["up", "down"]), }), + z.object({ + type: z.literal("scrape"), + }), ])); export const scrapeOptions = z.object({ diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 8aa1d004a..81bca5719 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -12,7 +12,8 @@ export interface Progress { export type Action = { type: "wait", - milliseconds: number, + milliseconds?: number, + selector?: string, } | { type: "click", selector: string, @@ -28,7 +29,9 @@ export type Action = { } | { type: "scroll", direction: "up" | "down" -}; +} | { + type: "scrape", +} export type PageOptions = { includeMarkdown?: boolean; @@ -163,11 +166,17 @@ export class SearchResult { } } +export interface ScrapeActionContent { + url: string; + html: string; +} + export interface FireEngineResponse { html: string; screenshots?: string[]; pageStatusCode?: number; pageError?: string; + scrapeActionContent?: ScrapeActionContent[]; } diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 3bbd74eb1..7332874f2 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -200,11 +200,13 @@ export async function scrapWithFireEngine({ logParams.html = data.content ?? ""; logParams.response_code = data.pageStatusCode; logParams.error_message = data.pageError ?? data.error; + return { html: data.content ?? "", screenshots: data.screenshots ?? [data.screenshot] ?? [], pageStatusCode: data.pageStatusCode, pageError: data.pageError ?? data.error, + scrapeActionContent: data?.actionContent ?? [], }; } } catch (error) { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c7185b798..611a7b5c3 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -21,6 +21,7 @@ import { extractLinks } from "./utils/utils"; import { Logger } from "../../lib/logger"; import { ScrapeEvents } from "../../lib/scrape-events"; import { clientSideError } from "../../strings"; +import { ScrapeActionContent } from "../../lib/entities"; dotenv.config(); @@ -180,7 +181,8 @@ export async function scrapSingleUrl( text: string; screenshot: string; actions?: { - screenshots: string[]; + screenshots?: string[]; + scrapes?: ScrapeActionContent[]; }; metadata: { pageStatusCode?: number; pageError?: string | null }; } = { text: "", screenshot: "", metadata: {} }; @@ -259,6 +261,7 @@ export async function scrapSingleUrl( if (pageOptions.actions) { scraperResponse.actions = { screenshots: response.screenshots ?? [], + scrapes: response.scrapeActionContent ?? [], }; } scraperResponse.metadata.pageStatusCode = response.pageStatusCode; From 80beedb46f281367019c784934093f25912fbc1a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 31 Oct 2024 20:03:19 -0300 Subject: [PATCH 2/4] Update index.ts --- apps/js-sdk/firecrawl/src/index.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index bbe934fed..7ad5a5f0f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -90,7 +90,8 @@ export interface CrawlScrapeOptions { export type Action = { type: "wait", - milliseconds: number, + milliseconds?: number, + selector?: string, } | { type: "click", selector: string, @@ -106,6 +107,8 @@ export type Action = { } | { type: "scroll", direction: "up" | "down", +} | { + type: "scrape", }; export interface ScrapeParams extends CrawlScrapeOptions { From 28db4dd3b587879083f947c7e17f6610ba386624 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:40:40 -0300 Subject: [PATCH 3/4] fixed zod validation for wait --- apps/api/src/controllers/v1/types.ts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 229893d1c..45aa3c9bd 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -61,9 +61,14 @@ export type ExtractOptions = z.infer; export const actionsSchema = z.array(z.union([ z.object({ type: z.literal("wait"), - milliseconds: z.number().int().positive().finite(), + milliseconds: z.number().int().positive().finite().optional(), selector: z.string().optional(), - }), + }).refine( + (data) => (data.milliseconds !== undefined || data.selector !== undefined) && !(data.milliseconds !== undefined && data.selector !== undefined), + { + message: "Either 'milliseconds' or 'selector' must be provided, but not both.", + } + ), z.object({ type: z.literal("click"), selector: z.string(), From 61f659190deb77cc06b5e473e6779d5e9ea2b32e Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:44:29 -0300 Subject: [PATCH 4/4] bumped js-sdk version --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index b8738e5eb..9b23077a4 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.7.2", + "version": "1.7.3", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts",