feat(v1): add public actions api

This commit is contained in:
Gergő Móricz 2024-09-18 20:39:25 +02:00
parent 42d677fe3c
commit 093c064bff
4 changed files with 74 additions and 55 deletions

View File

@ -1,7 +1,7 @@
import { Request, Response } from "express";
import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { ExtractorOptions, PageOptions } from "../../lib/entities";
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types";
@ -57,6 +57,21 @@ export const extractOptions = z.object({
export type ExtractOptions = z.infer<typeof extractOptions>;
/**
 * Validation schema for the `actions` option of a scrape request: an array of
 * page actions, each identified by its literal `type` tag.
 *
 * Supported variants:
 * - `wait`: `milliseconds` must be a positive, finite integer.
 * - `click`: `selector` must be a string (presumably a CSS selector — the
 *   schema itself does not validate its syntax).
 * - `screenshot`: `fullPage` is a boolean that defaults to `false` when omitted.
 *
 * A discriminated union keyed on `type` is used rather than a plain union so
 * that an invalid action is checked only against the variant its `type` names.
 * That produces a single targeted error instead of one error per variant.
 * The inferred element type is the same as with a plain union.
 */
export const actionsSchema = z.array(z.discriminatedUnion("type", [
  z.object({
    type: z.literal("wait"),
    milliseconds: z.number().int().positive().finite(),
  }),
  z.object({
    type: z.literal("click"),
    selector: z.string(),
  }),
  z.object({
    type: z.literal("screenshot"),
    fullPage: z.boolean().default(false),
  }),
]));
export const scrapeOptions = z.object({
formats: z
.enum([
@ -80,6 +95,7 @@ export const scrapeOptions = z.object({
waitFor: z.number().int().nonnegative().finite().safe().default(0),
extract: extractOptions.optional(),
parsePDF: z.boolean().default(true),
actions: actionsSchema.optional(),
}).strict(strictMessage)
@ -185,6 +201,9 @@ export type Document = {
rawHtml?: string;
links?: string[];
screenshot?: string;
actions?: {
screenshots: string[];
};
metadata: {
title?: string;
description?: string;
@ -336,6 +355,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
screenshot: x.formats.includes("screenshot"),
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery
};
}
@ -370,6 +390,7 @@ export function legacyDocumentConverter(doc: any): Document {
html: doc.html,
extract: doc.llm_extraction,
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
actions: doc.actions ?? undefined,
metadata: {
...doc.metadata,
pageError: undefined,

View File

@ -110,6 +110,9 @@ export class Document {
childrenLinks?: string[];
provider?: string;
warning?: string;
actions?: {
screenshots: string[];
}
index?: number;
linksOnPage?: string[]; // Add this new field as a separate property
@ -149,7 +152,7 @@ export class SearchResult {
/**
 * Shape of a Fire-Engine scrape result.
 *
 * NOTE(review): this diff view lists both `screenshot` and `screenshots`.
 * The updated return statements in `scrapWithFireEngine` no longer set
 * `screenshot`, so it is presumably the field being replaced by
 * `screenshots` — confirm against the final file.
 */
export interface FireEngineResponse {
// HTML content of the page.
html: string;
screenshot: string;
// Screenshots reported for the request; optional, so consumers must handle
// `undefined`.
screenshots?: string[];
// Status code of the page, when available.
pageStatusCode?: number;
// Error message associated with the page or the request, when available.
pageError?: string;
}

View File

@ -136,7 +136,7 @@ export async function scrapWithFireEngine({
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
logParams.error_message = "Request timed out";
return { html: "", screenshot: "", pageStatusCode: null, pageError: "" };
return { html: "", pageStatusCode: null, pageError: "" };
}
if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
@ -155,7 +155,6 @@ export async function scrapWithFireEngine({
return {
html: "",
screenshot: "",
pageStatusCode,
pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
};
@ -171,7 +170,7 @@ export async function scrapWithFireEngine({
logParams.success = true;
logParams.response_code = pageStatusCode;
logParams.error_message = pageError;
return { html: content, screenshot: "", pageStatusCode, pageError };
return { html: content, pageStatusCode, pageError };
} else {
const data = checkStatusResponse.data;
@ -183,7 +182,7 @@ export async function scrapWithFireEngine({
logParams.error_message = data.pageError ?? data.error;
return {
html: data.content ?? "",
screenshot: data.screenshot ?? "",
screenshots: data.screenshots,
pageStatusCode: data.pageStatusCode,
pageError: data.pageError ?? data.error,
};
@ -196,7 +195,7 @@ export async function scrapWithFireEngine({
Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
logParams.error_message = error.message || error;
}
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
return { html: "", pageStatusCode: null, pageError: logParams.error_message };
} finally {
const endTime = Date.now();
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;

View File

@ -69,8 +69,13 @@ function getScrapingFallbackOrder(
defaultScraper?: string,
isWaitPresent: boolean = false,
isScreenshotPresent: boolean = false,
isHeadersPresent: boolean = false
isHeadersPresent: boolean = false,
isActionsPresent: boolean = false,
) {
if (isActionsPresent) {
return useFireEngine ? ["fire-engine;chrome-cdp"] : [];
}
const availableScrapers = baseScrapers.filter((scraper) => {
switch (scraper) {
case "scrapingBee":
@ -170,6 +175,9 @@ export async function scrapSingleUrl(
let scraperResponse: {
text: string;
screenshot: string;
actions?: {
screenshots: string[];
};
metadata: { pageStatusCode?: number; pageError?: string | null };
} = { text: "", screenshot: "", metadata: {} };
let screenshot = "";
@ -217,7 +225,14 @@ export async function scrapSingleUrl(
teamId,
});
scraperResponse.text = response.html;
scraperResponse.screenshot = response.screenshot;
if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
}
if (pageOptions.actions) {
scraperResponse.actions = {
screenshots: response.screenshots ?? [],
};
}
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
scraperResponse.metadata.pageError = response.pageError;
}
@ -283,9 +298,6 @@ export async function scrapSingleUrl(
]) : ([]),
pageOptions: customScraperResult.pageOptions,
});
if (screenshot) {
customScrapedContent.screenshot = screenshot;
}
break;
case "pdf":
const { content, pageStatusCode, pageError } =
@ -295,7 +307,6 @@ export async function scrapSingleUrl(
);
customScrapedContent = {
html: content,
screenshot,
pageStatusCode,
pageError,
};
@ -305,7 +316,6 @@ export async function scrapSingleUrl(
if (customScrapedContent) {
scraperResponse.text = customScrapedContent.html;
screenshot = customScrapedContent.screenshot;
}
//* TODO: add an optional to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
@ -325,16 +335,18 @@ export async function scrapSingleUrl(
html: cleanedHtml,
rawHtml: scraperResponse.text,
screenshot: scraperResponse.screenshot,
actions: scraperResponse.actions,
pageStatusCode: scraperResponse.metadata.pageStatusCode,
pageError: scraperResponse.metadata.pageError || undefined,
};
};
let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = {
text: "",
html: "",
rawHtml: "",
screenshot: "",
actions: undefined,
pageStatusCode: 200,
pageError: undefined,
};
@ -350,7 +362,8 @@ export async function scrapSingleUrl(
defaultScraper,
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
);
for (const scraper of scrapersInOrder) {
@ -367,6 +380,7 @@ export async function scrapSingleUrl(
html = attempt.html ?? "";
rawHtml = attempt.rawHtml ?? "";
screenshot = attempt.screenshot ?? "";
actions = attempt.actions ?? undefined;
if (attempt.pageStatusCode) {
pageStatusCode = attempt.pageStatusCode;
@ -404,45 +418,27 @@ export async function scrapSingleUrl(
linksOnPage = extractLinks(rawHtml, urlToScrap);
}
let document: Document;
if (screenshot && screenshot.length > 0) {
document = {
content: text,
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
? rawHtml
: undefined,
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
metadata: {
...metadata,
screenshot: screenshot,
sourceURL: urlToScrap,
pageStatusCode: pageStatusCode,
pageError: pageError,
},
};
} else {
document = {
content: text,
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
? rawHtml
: undefined,
metadata: {
...metadata,
sourceURL: urlToScrap,
pageStatusCode: pageStatusCode,
pageError: pageError,
},
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
};
}
let document: Document = {
content: text,
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
? rawHtml
: undefined,
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
actions,
metadata: {
...metadata,
...(screenshot && screenshot.length > 0 ? ({
screenshot,
}) : {}),
sourceURL: urlToScrap,
pageStatusCode: pageStatusCode,
pageError: pageError,
},
};
return document;
} catch (error) {