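// firecrawl/apps/api/src/lib/entities.ts (TypeScript, 144 lines, 3.4 KiB)
// NOTE: this text was copied from a web "blame" view; the date/time lines in
// this file appear to be per-line commit timestamps from that view, not code.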

// 2024-04-15 17:01:47 -04:00
/**
 * Shape of a progress report.
 *
 * This file only declares the fields; who produces the report and the exact
 * meaning of each counter is defined by code that is not visible here.
 */
export interface Progress {
  /** Numeric position so far (presumably items processed; confirm with the producer). */
  current: number;
  /** Numeric total that `current` is reported against. */
  total: number;
  /** Free-form status label; this type does not restrict its values. */
  status: string;
  /** Optional open-ended metadata bag; `sourceURL` is the only named key. */
  metadata?: {
    sourceURL?: string;
    [key: string]: any;
  };
  /** Optional URL string accompanying the report. */
  currentDocumentUrl?: string;
  /** Optional `Document` accompanying the report. */
  currentDocument?: Document;
}
// 2024-04-17 18:24:46 -07:00
/**
 * Per-page options bag.
 *
 * Every field is optional. Defaults and the precise effect of each flag are
 * decided by the consumers of this type, which are not part of this file.
 */
export type PageOptions = {
  onlyMainContent?: boolean;
  includeHtml?: boolean;
  includeRawHtml?: boolean;
  fallback?: boolean;
  fetchPageContent?: boolean;
  /** Numeric wait value; the unit is not stated here (TODO confirm with the consumer). */
  waitFor?: number;
  screenshot?: boolean;
  /** String-to-string map of headers. */
  headers?: Record<string, string>;
  replaceAllPathsWithAbsolutePaths?: boolean;
  parsePDF?: boolean;
  /** Either a single string or a list of strings. */
  removeTags?: string | string[];
  /** Either a single string or a list of strings. */
  onlyIncludeTags?: string | string[];
};
// 2024-04-23 15:44:11 -07:00
/**
 * Extraction options.
 *
 * `mode` is required and must be one of the four literal values below; the
 * prompt and schema are optional. How each mode is carried out is up to the
 * code that consumes this type (not shown in this file).
 */
export type ExtractorOptions = {
  mode:
    | "markdown"
    | "llm-extraction"
    | "llm-extraction-from-markdown"
    | "llm-extraction-from-raw-html";
  extractionPrompt?: string;
  /** Open-ended object; its structure is not constrained by this type. */
  extractionSchema?: Record<string, any>;
};
// 2024-04-23 15:44:11 -07:00
/**
 * Search options bag.
 *
 * All fields are optional; accepted values and defaults are determined by the
 * search implementation, which is not visible in this file.
 */
export type SearchOptions = {
  limit?: number;
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
};
/**
 * Crawler options bag.
 *
 * All fields are optional; this file declares the shape only. Defaults and
 * the behavior behind each flag belong to the crawler code (not shown here).
 */
export type CrawlerOptions = {
  returnOnlyUrls?: boolean;
  /** List of strings (presumably URL patterns; confirm with the crawler). */
  includes?: string[];
  /** List of strings (presumably URL patterns; confirm with the crawler). */
  excludes?: string[];
  maxCrawledLinks?: number;
  maxDepth?: number;
  limit?: number;
  generateImgAltText?: boolean;
  replaceAllPathsWithAbsolutePaths?: boolean;
  ignoreSitemap?: boolean;
  mode?: "default" | "fast"; // have a mode of some sort
  allowBackwardCrawling?: boolean;
  allowExternalContentLinks?: boolean;
};
// 2024-04-17 18:24:46 -07:00
/**
 * Top-level options bag for a scraping job.
 *
 * `jobId`, `urls` and `mode` are required; the nested option bags and the
 * remaining fields are optional. How they are interpreted is up to the code
 * that consumes this type (not shown in this file).
 */
export type WebScraperOptions = {
  jobId: string;
  urls: string[];
  mode: "single_urls" | "sitemap" | "crawl";
  crawlerOptions?: CrawlerOptions;
  pageOptions?: PageOptions;
  extractorOptions?: ExtractorOptions;
  concurrentRequests?: number;
  /** Optional second job identifier (presumably the queue's job id; confirm with the caller). */
  bullJobId?: string;
};
// 2024-04-20 11:59:42 -07:00
/** Object carrying a single required `url` string. */
export interface DocumentUrl {
  url: string;
}
// 2024-04-15 17:01:47 -04:00
/**
 * A document record: required `content` and `metadata`, plus optional
 * alternative representations (`markdown`, `html`, `rawHtml`), extraction
 * output, timestamps and link lists.
 */
export class Document {
  id?: string;
  url?: string; // Used only in /search for now
  content: string;
  markdown?: string;
  html?: string;
  rawHtml?: string;
  llm_extraction?: Record<string, any>;
  createdAt?: Date;
  updatedAt?: Date;
  type?: string;
  metadata: {
    sourceURL?: string;
    [key: string]: any;
  };
  childrenLinks?: string[];
  provider?: string;
  warning?: string;

  index?: number;
  linksOnPage?: string[]; // Add this new field as a separate property

  /**
   * Builds a document from a partial set of fields.
   *
   * Defaults applied when a field is missing (or falsy, for `type`,
   * `markdown` and `provider`): `createdAt`/`updatedAt` become the current
   * time, `type` becomes `"unknown"`, `metadata` becomes `{ sourceURL: "" }`
   * and `markdown` becomes `""`.
   *
   * @param data - Field values to copy; `content` must be a non-empty string.
   * @throws Error with message "Missing required fields" when `data.content`
   *   is falsy. Note that this includes the empty string.
   */
  constructor(data: Partial<Document>) {
    if (!data.content) {
      throw new Error("Missing required fields");
    }

    this.content = data.content;
    this.createdAt = data.createdAt || new Date();
    this.updatedAt = data.updatedAt || new Date();
    this.type = data.type || "unknown";
    this.metadata = data.metadata || { sourceURL: "" };
    this.markdown = data.markdown || "";
    this.childrenLinks = data.childrenLinks || undefined;
    this.provider = data.provider || undefined;
    this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided

    // The fields below are accepted by Partial<Document> but were previously
    // dropped on construction. Copy them only when supplied, so instances
    // built from inputs without them keep the same set of own properties.
    if (data.id !== undefined) this.id = data.id;
    if (data.url !== undefined) this.url = data.url;
    if (data.html !== undefined) this.html = data.html;
    if (data.rawHtml !== undefined) this.rawHtml = data.rawHtml;
    if (data.llm_extraction !== undefined) {
      this.llm_extraction = data.llm_extraction;
    }
    if (data.warning !== undefined) this.warning = data.warning;
    if (data.index !== undefined) this.index = data.index;
  }
}
// 2024-04-24 10:11:01 -07:00
/**
 * A single search result consisting of a URL, a title and a description.
 */
export class SearchResult {
  /**
   * @param url - Stored as `this.url`.
   * @param title - Stored as `this.title`.
   * @param description - Stored as `this.description`.
   */
  constructor(
    public url: string,
    public title: string,
    public description: string,
  ) {}

  /**
   * @returns A one-line rendering of the form
   *   `SearchResult(url=…, title=…, description=…)`.
   */
  toString(): string {
    return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`;
  }
}
/**
 * Response shape with required `html` and `screenshot` strings, plus an
 * optional page status code and page error message.
 */
export interface FireEngineResponse {
  html: string;
  screenshot: string;
  pageStatusCode?: number;
  pageError?: string;
}
// 2024-07-12 22:02:08 -04:00
/**
 * Options bag with all-optional fields.
 *
 * Accepted values for `method` and `engine` are not restricted by this type;
 * they are defined by the code that consumes it (not shown in this file).
 */
export interface FireEngineOptions {
  mobileProxy?: boolean;
  method?: string;
  engine?: string;
  blockMedia?: boolean;
  blockAds?: boolean;
  disableJsDom?: boolean;
}