2024-04-15 17:01:47 -04:00
|
|
|
/**
 * Progress report: a `current`/`total` counter pair, a status string and
 * optional details about the document being handled.
 */
export interface Progress {
  /** Counter value reached so far; read together with `total`. */
  current: number;
  /** Counter value that represents completion. */
  total: number;
  /** Free-form status label; the set of values is not constrained by this type. */
  status: string;
  /** Optional extra details; any key is accepted in addition to `sourceURL`. */
  metadata?: {
    sourceURL?: string;
    [key: string]: any;
  };
  /** URL of the document referred to by this report, when known. */
  currentDocumentUrl?: string;
  /** The document referred to by this report, when available. */
  currentDocument?: Document;
}
|
|
|
|
|
2024-09-18 20:04:54 +02:00
|
|
|
/**
 * One step of a page interaction, discriminated by its `type` tag.
 *
 * - `"wait"`: carries a `milliseconds` duration.
 * - `"click"`: carries the `selector` of the element to click.
 * - `"screenshot"`: optionally flagged as `fullPage`.
 */
export type Action =
  | {
      type: "wait";
      milliseconds: number;
    }
  | {
      type: "click";
      selector: string;
    }
  | {
      type: "screenshot";
      fullPage?: boolean;
    };
|
|
|
|
|
2024-04-17 18:24:46 -07:00
|
|
|
/**
 * Per-page scraping options. Every field is optional; defaults are applied by
 * the consumers of this type and are not visible in this declaration.
 */
export type PageOptions = {
  /** Include the Markdown rendition in the result. */
  includeMarkdown?: boolean;
  /** Include the extraction output in the result. */
  includeExtract?: boolean;
  /** Restrict output to the main content of the page. */
  onlyMainContent?: boolean;
  /** Include the HTML in the result. */
  includeHtml?: boolean;
  /** Include the raw HTML in the result. */
  includeRawHtml?: boolean;
  fallback?: boolean;
  fetchPageContent?: boolean;
  // NOTE(review): unit is not stated here (presumably milliseconds) — confirm.
  waitFor?: number;
  /** Capture a screenshot of the page. */
  screenshot?: boolean;
  /** Capture a full-page screenshot. */
  fullPageScreenshot?: boolean;
  /** Header name to header value map. */
  headers?: Record<string, string>;
  replaceAllPathsWithAbsolutePaths?: boolean;
  parsePDF?: boolean;
  /** Tag(s) to remove; a single string or a list. */
  removeTags?: string | string[];
  /** Tag(s) to keep exclusively; a single string or a list. */
  onlyIncludeTags?: string | string[];
  includeLinks?: boolean;
  useFastMode?: boolean; // beta
  disableJsDom?: boolean; // beta
  atsv?: boolean; // anti-bot solver, beta
  /** Ordered list of page interactions. */
  actions?: Action[]; // beta
};
|
2024-04-23 15:44:11 -07:00
|
|
|
|
2024-04-28 15:52:09 -07:00
|
|
|
/**
 * Options selecting how content is extracted from a page.
 */
export type ExtractorOptions = {
  /** Extraction strategy; the only required field. */
  mode:
    | "markdown"
    | "llm-extraction"
    | "llm-extraction-from-markdown"
    | "llm-extraction-from-raw-html";
  /** Prompt text for the extraction. */
  extractionPrompt?: string;
  /** Schema object for the extraction; its shape is not constrained by this type. */
  extractionSchema?: Record<string, any>;
  /** Additional user-supplied prompt text. */
  userPrompt?: string;
};
|
|
|
|
|
2024-04-23 15:44:11 -07:00
|
|
|
/**
 * Optional parameters for a search request. All values are passed as plain
 * strings/numbers; their accepted formats are not constrained by this type.
 */
export type SearchOptions = {
  /** Maximum number of results. */
  limit?: number;
  // NOTE(review): `tbs` looks like a search-engine query parameter — confirm its format with the caller.
  tbs?: string;
  filter?: string;
  /** Language selector. */
  lang?: string;
  /** Country selector. */
  country?: string;
  /** Location selector. */
  location?: string;
};
|
|
|
|
|
2024-06-10 18:12:41 -07:00
|
|
|
/**
 * Options controlling a crawl. Every field is optional; defaults are applied
 * by the consumers of this type and are not visible in this declaration.
 */
export type CrawlerOptions = {
  /** Return only the discovered URLs. */
  returnOnlyUrls?: boolean;
  /** Pattern(s) to include; a single string or a list. */
  includes?: string | string[];
  /** Pattern(s) to exclude; a single string or a list. */
  excludes?: string | string[];
  maxCrawledLinks?: number;
  /** Maximum crawl depth. */
  maxDepth?: number;
  limit?: number;
  generateImgAltText?: boolean;
  replaceAllPathsWithAbsolutePaths?: boolean;
  /** Skip the sitemap. */
  ignoreSitemap?: boolean;
  /** Crawl mode selector: `"default"` or `"fast"`. */
  mode?: "default" | "fast";
  allowBackwardCrawling?: boolean;
  allowExternalContentLinks?: boolean;
};
|
|
|
|
|
2024-04-17 18:24:46 -07:00
|
|
|
/**
 * Full description of a scraping job: which URLs to process, in which mode,
 * and the option bundles applied to it.
 */
export type WebScraperOptions = {
  /** Identifier of the job. */
  jobId: string;
  /** URLs the job operates on. */
  urls: string[];
  /** How `urls` are processed. */
  mode: "single_urls" | "sitemap" | "crawl";
  crawlerOptions?: CrawlerOptions;
  pageOptions?: PageOptions;
  extractorOptions?: ExtractorOptions;
  concurrentRequests?: number;
  // NOTE(review): presumably the id of the queue job backing this scrape — confirm.
  bullJobId?: string;
  priority?: number;
  teamId?: string;
};
|
|
|
|
|
2024-04-20 11:59:42 -07:00
|
|
|
/**
 * Minimal document reference that carries only a URL.
 */
export interface DocumentUrl {
  url: string;
}
|
|
|
|
|
2024-04-15 17:01:47 -04:00
|
|
|
/**
 * A document together with its renditions (Markdown, HTML, raw HTML),
 * extraction output and bookkeeping fields.
 */
export class Document {
  id?: string;
  url?: string; // Used only in /search for now
  /** Required textual content; the constructor rejects a missing or empty value. */
  content: string;
  markdown?: string;
  html?: string;
  rawHtml?: string;
  llm_extraction?: Record<string, any>;
  createdAt?: Date;
  updatedAt?: Date;
  type?: string;
  /** Arbitrary metadata; any key is accepted in addition to `sourceURL`. */
  metadata: {
    sourceURL?: string;
    [key: string]: any;
  };
  childrenLinks?: string[];
  provider?: string;
  warning?: string;

  index?: number;
  /** Links found on the page, kept separately from `metadata`. */
  linksOnPage?: string[];

  /**
   * Builds a document from a partial description.
   *
   * Defaults applied when a value is missing or falsy: `createdAt` and
   * `updatedAt` become the current time, `type` becomes `"unknown"`,
   * `metadata` becomes `{ sourceURL: "" }` and `markdown` becomes `""`.
   *
   * @param data - Field values to copy; only `content` is mandatory.
   * @throws Error with message "Missing required fields" when `data.content`
   *   is absent or an empty string.
   */
  constructor(data: Partial<Document>) {
    if (!data.content) {
      throw new Error("Missing required fields");
    }
    this.content = data.content;
    this.createdAt = data.createdAt || new Date();
    this.updatedAt = data.updatedAt || new Date();
    this.type = data.type || "unknown";
    this.metadata = data.metadata || { sourceURL: "" };
    this.markdown = data.markdown || "";
    this.childrenLinks = data.childrenLinks || undefined;
    this.provider = data.provider || undefined;
    this.linksOnPage = data.linksOnPage;

    // These declared fields used to be dropped even when supplied. They are
    // copied only when defined, so instances built without them keep their
    // previous shape.
    if (data.id !== undefined) {
      this.id = data.id;
    }
    if (data.url !== undefined) {
      this.url = data.url;
    }
    if (data.html !== undefined) {
      this.html = data.html;
    }
    if (data.rawHtml !== undefined) {
      this.rawHtml = data.rawHtml;
    }
    if (data.llm_extraction !== undefined) {
      this.llm_extraction = data.llm_extraction;
    }
    if (data.warning !== undefined) {
      this.warning = data.warning;
    }
    if (data.index !== undefined) {
      this.index = data.index;
    }
  }
}
|
2024-04-24 10:11:01 -07:00
|
|
|
|
|
|
|
|
|
|
|
/**
 * A single search hit: its URL, title and description.
 */
export class SearchResult {
  url: string;
  title: string;
  description: string;

  /**
   * @param url - Address of the hit.
   * @param title - Title of the hit.
   * @param description - Description text of the hit.
   */
  constructor(url: string, title: string, description: string) {
    this.url = url;
    this.title = title;
    this.description = description;
  }

  /**
   * @returns A one-line rendering of the form
   *   `SearchResult(url=..., title=..., description=...)`.
   */
  toString(): string {
    return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
  }
}
|
|
|
|
|
|
|
|
/**
 * Result returned by the fire-engine scraper.
 */
export interface FireEngineResponse {
  /** HTML of the page. */
  html: string;
  // NOTE(review): the encoding of this string (URL, base64, ...) is not visible here — confirm.
  screenshot: string;
  /** Status code reported for the page, when available. */
  pageStatusCode?: number;
  /** Error text reported for the page, when available. */
  pageError?: string;
}
|
|
|
|
|
2024-07-12 22:02:08 -04:00
|
|
|
|
|
|
|
/**
 * Optional switches passed to the fire-engine scraper. Defaults are applied by
 * the consumers of this type and are not visible in this declaration.
 */
export interface FireEngineOptions {
  /** Use a mobile proxy. */
  mobileProxy?: boolean;
  // NOTE(review): presumably the HTTP method — confirm with the caller.
  method?: string;
  /** Name of the engine to use; accepted values are not constrained here. */
  engine?: string;
  /** Block media. */
  blockMedia?: boolean;
  /** Block ads. */
  blockAds?: boolean;
  disableJsDom?: boolean;
  atsv?: boolean; // beta
}
|