import {
  Document,
  ExtractorOptions,
  PageOptions,
  Progress,
  WebScraperOptions,
} from "../../lib/entities";
import { scrapSingleUrl } from "./single_url";
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
import { WebCrawler } from "./crawler";
import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/imageDescription";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import {
  replaceImgPathsWithAbsolutePaths,
  replacePathsWithAbsolutePaths,
} from "./utils/replacePaths";
import { generateCompletions } from "../../lib/LLM-extraction";
import { getWebScraperQueue } from "../../../src/services/queue-service";
import { fetchAndProcessDocx } from "./utils/docxProcessor";
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";

export class WebScraperDataProvider {
  private bullJobId: string;
  private urls: string[] = [""];
  private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
  private includes: string[];
  private excludes: string[];
  private maxCrawledLinks: number;
  private maxCrawledDepth: number = 10;
  private returnOnlyUrls: boolean;
  private limit: number = 10000;
  private concurrentRequests: number = 20;
  private generateImgAltText: boolean = false;
  private ignoreSitemap: boolean = false;
  private pageOptions?: PageOptions;
  private extractorOptions?: ExtractorOptions;
  private replaceAllPathsWithAbsolutePaths?: boolean = false;
  private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
    "gpt-4-turbo";
  private crawlerMode: string = "default";
  private allowBackwardCrawling: boolean = false;

  authorize(): void {
    throw new Error("Method not implemented.");
  }

  authorizeNango(): Promise<void> {
    throw new Error("Method not implemented.");
  }

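  /**
   * Scrapes URLs in batches of `concurrentRequests`, reporting progress per
   * page; in crawl mode it checks the Bull job after each batch and returns an
   * empty result if the job has failed or been cancelled.
   */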
  private async convertUrlsToDocuments(
    urls: string[],
    inProgress?: (progress: Progress) => void,
    allHtmls?: string[]
  ): Promise<Document[]> {
    const totalUrls = urls.length;
    let processedUrls = 0;

    const results: (Document | null)[] = new Array(urls.length).fill(null);
    for (let i = 0; i < urls.length; i += this.concurrentRequests) {
      const batchUrls = urls.slice(i, i + this.concurrentRequests);
      await Promise.all(
        batchUrls.map(async (url, index) => {
          const existingHTML = allHtmls ? allHtmls[i + index] : "";
          const result = await scrapSingleUrl(
            url,
            this.pageOptions,
            existingHTML
          );
          processedUrls++;
          if (inProgress) {
            inProgress({
              current: processedUrls,
              total: totalUrls,
              status: "SCRAPING",
              currentDocumentUrl: url,
              currentDocument: { ...result, index: processedUrls },
            });
          }

          results[i + index] = result;
        })
      );
      try {
        if (this.mode === "crawl" && this.bullJobId) {
          const job = await getWebScraperQueue().getJob(this.bullJobId);
          const jobStatus = await job.getState();
          if (jobStatus === "failed") {
            console.error(
              "Job has failed or has been cancelled by the user. Stopping the job..."
            );
            return [] as Document[];
          }
        }
      } catch (error) {
        console.error(error);
        return [] as Document[];
      }
    }
    return results.filter((result) => result !== null) as Document[];
  }

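  /**
   * Public entry point: validates the initial URL and returns scraped
   * documents, reusing the Redis cache when `useCaching` is true.
   */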
  async getDocuments(
    useCaching: boolean = false,
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    this.validateInitialUrl();
    if (!useCaching) {
      return this.processDocumentsWithoutCache(inProgress);
    }

    return this.processDocumentsWithCache(inProgress);
  }

  private validateInitialUrl(): void {
    if (this.urls[0].trim() === "") {
      throw new Error("Url is required");
    }
  }

  /**
   * Processes documents without the cache, dispatching on the configured mode.
   * @param inProgress optional progress callback
   * @returns the scraped documents
   */
  private async processDocumentsWithoutCache(
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    switch (this.mode) {
      case "crawl":
        return this.handleCrawlMode(inProgress);
      case "single_urls":
        return this.handleSingleUrlsMode(inProgress);
      case "sitemap":
        return this.handleSitemapMode(inProgress);
      default:
        return [];
    }
  }

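  /**
   * Keeps only links that share the initial URL's hostname (ignoring a leading
   * "www.") and whose path starts with the initial URL's path.
   */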
  private async cleanIrrelevantPath(links: string[]) {
    return links.filter((link) => {
      const normalizedInitialUrl = new URL(this.urls[0]);
      const normalizedLink = new URL(link);

      // Normalize the hostname to account for www and non-www versions
      const initialHostname = normalizedInitialUrl.hostname.replace(
        /^www\./,
        ""
      );
      const linkHostname = normalizedLink.hostname.replace(/^www\./, "");

      // Ensure the hostnames match and the link's path starts with the initial URL's path
      return (
        linkHostname === initialHostname &&
        normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
      );
    });
  }

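  /**
   * Crawl mode: starts a WebCrawler from the initial URL, then scrapes the
   * discovered links (reusing crawler-fetched HTML when fast mode is enabled)
   * and caches the resulting documents.
   */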
  private async handleCrawlMode(
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    const crawler = new WebCrawler({
      initialUrl: this.urls[0],
      includes: this.includes,
      excludes: this.excludes,
      maxCrawledLinks: this.maxCrawledLinks,
      maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
      limit: this.limit,
      generateImgAltText: this.generateImgAltText,
      allowBackwardCrawling: this.allowBackwardCrawling,
    });

    let links = await crawler.start(
      inProgress,
      this.pageOptions,
      {
        ignoreSitemap: this.ignoreSitemap,
      },
      5,
      this.limit,
      this.maxCrawledDepth
    );

    let allLinks = links.map((e) => e.url);
    const allHtmls = links.map((e) => e.html);

    if (this.returnOnlyUrls) {
      return this.returnOnlyUrlsResponse(allLinks, inProgress);
    }

    let documents = [];
    // Check if fast mode is enabled and there is html inside the links
    if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
      documents = await this.processLinks(allLinks, inProgress, allHtmls);
    } else {
      documents = await this.processLinks(allLinks, inProgress);
    }

    return this.cacheAndFinalizeDocuments(documents, allLinks);
  }

  private async handleSingleUrlsMode(
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    const links = this.urls;

    let documents = await this.processLinks(links, inProgress);
    return documents;
  }

  private async handleSitemapMode(
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    let links = await getLinksFromSitemap(this.urls[0]);
    links = await this.cleanIrrelevantPath(links);

    if (this.returnOnlyUrls) {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }

    let documents = await this.processLinks(links, inProgress);
    return this.cacheAndFinalizeDocuments(documents, links);
  }

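  /**
   * Used when `returnOnlyUrls` is set: reports the progress as COMPLETED and
   * returns one empty document per link, carrying only the source URL (with
   * status 200) in its metadata.
   */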
  private async returnOnlyUrlsResponse(
    links: string[],
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    inProgress?.({
      current: links.length,
      total: links.length,
      status: "COMPLETED",
      currentDocumentUrl: this.urls[0],
    });
    return links.map((url) => ({
      content: "",
      html: this.pageOptions?.includeHtml ? "" : undefined,
      markdown: "",
      metadata: { sourceURL: url, pageStatusCode: 200 },
    }));
  }

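  /**
   * Splits links into PDF, DOCX, and regular web pages, scrapes each group
   * with the appropriate processor, enriches documents with sitemap metadata
   * and absolute paths, and runs LLM extraction when requested for single URLs.
   */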
  private async processLinks(
    links: string[],
    inProgress?: (progress: Progress) => void,
    allHtmls?: string[]
  ): Promise<Document[]> {
    const pdfLinks = links.filter((link) => link.endsWith(".pdf"));
    const docLinks = links.filter(
      (link) => link.endsWith(".doc") || link.endsWith(".docx")
    );

    const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
    const docxDocuments = await this.fetchDocxDocuments(docLinks);

    links = links.filter(
      (link) => !pdfLinks.includes(link) && !docLinks.includes(link)
    );

    let documents = await this.convertUrlsToDocuments(
      links,
      inProgress,
      allHtmls
    );

    documents = await this.getSitemapData(this.urls[0], documents);
    documents = this.applyPathReplacements(documents);
    // documents = await this.applyImgAltText(documents);

    if (
      this.extractorOptions.mode === "llm-extraction" &&
      this.mode === "single_urls"
    ) {
      documents = await generateCompletions(documents, this.extractorOptions);
    }
    return documents.concat(pdfDocuments).concat(docxDocuments);
  }

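  /**
   * Downloads and parses each PDF link into a document via fetchAndProcessPdf,
   * honoring the `parsePDF` page option.
   */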
  private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
    return Promise.all(
      pdfLinks.map(async (pdfLink) => {
        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
          pdfLink,
          this.pageOptions.parsePDF
        );
        return {
          content: content,
          metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
          provider: "web-scraper",
        };
      })
    );
  }

  private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
    return Promise.all(
      docxLinks.map(async (p) => {
        const { content, pageStatusCode, pageError } =
          await fetchAndProcessDocx(p);
        return {
          content,
          metadata: { sourceURL: p, pageStatusCode, pageError },
          provider: "web-scraper",
        };
      })
    );
  }

  private applyPathReplacements(documents: Document[]): Document[] {
    if (this.replaceAllPathsWithAbsolutePaths) {
      documents = replacePathsWithAbsolutePaths(documents);
    }
    return replaceImgPathsWithAbsolutePaths(documents);
  }

  private async applyImgAltText(documents: Document[]): Promise<Document[]> {
    return this.generateImgAltText
      ? this.generatesImgAltText(documents)
      : documents;
  }

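  /**
   * Writes the documents to the Redis cache, strips their children links, and
   * trims the result to the configured limit.
   */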
  private async cacheAndFinalizeDocuments(
    documents: Document[],
    links: string[]
  ): Promise<Document[]> {
    await this.setCachedDocuments(documents, links);
    documents = this.removeChildLinks(documents);
    return documents.splice(0, this.limit);
  }

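  /**
   * Loads cached documents first and, if fewer than the limit are found,
   * scrapes fresh ones and merges them in; then applies the include/exclude,
   * depth, and child-link filters up to the limit.
   */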
  private async processDocumentsWithCache(
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    let documents = await this.getCachedDocuments(
      this.urls.slice(0, this.limit)
    );
    if (documents.length < this.limit) {
      const newDocuments: Document[] = await this.getDocuments(
        false,
        inProgress
      );
      documents = this.mergeNewDocuments(documents, newDocuments);
    }
    documents = this.filterDocsExcludeInclude(documents);
    documents = this.filterDepth(documents);
    documents = this.removeChildLinks(documents);
    return documents.splice(0, this.limit);
  }

  private mergeNewDocuments(
    existingDocuments: Document[],
    newDocuments: Document[]
  ): Document[] {
    newDocuments.forEach((doc) => {
      if (
        !existingDocuments.some(
          (d) =>
            this.normalizeUrl(d.metadata.sourceURL) ===
            this.normalizeUrl(doc.metadata?.sourceURL)
        )
      ) {
        existingDocuments.push(doc);
      }
    });
    return existingDocuments;
  }

  private filterDocsExcludeInclude(documents: Document[]): Document[] {
    return documents.filter((document) => {
      const url = new URL(document.metadata.sourceURL);
      const path = url.pathname;

      if (this.excludes.length > 0 && this.excludes[0] !== "") {
        // Check if the link should be excluded
        if (
          this.excludes.some((excludePattern) =>
            new RegExp(excludePattern).test(path)
          )
        ) {
          return false;
        }
      }

      if (this.includes.length > 0 && this.includes[0] !== "") {
        // Check if the link matches the include patterns, if any are specified
        if (this.includes.length > 0) {
          return this.includes.some((includePattern) =>
            new RegExp(includePattern).test(path)
          );
        }
      }
      return true;
    });
  }

  private normalizeUrl(url: string): string {
    if (url.includes("//www.")) {
      return url.replace("//www.", "//");
    }
    return url;
  }

  private removeChildLinks(documents: Document[]): Document[] {
    for (let document of documents) {
      if (document?.childrenLinks) delete document.childrenLinks;
    }
    return documents;
  }

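  /**
   * Caches each non-empty document in Redis under "web-scraper-cache:<url>"
   * for 10 days, together with its children links.
   */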
  async setCachedDocuments(documents: Document[], childrenLinks?: string[]) {
    for (const document of documents) {
      if (document.content.trim().length === 0) {
        continue;
      }
      const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
      await setValue(
        "web-scraper-cache:" + normalizedUrl,
        JSON.stringify({
          ...document,
          childrenLinks: childrenLinks || [],
        }),
        60 * 60 * 24 * 10
      ); // 10 days
    }
  }

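  /**
   * Reads cached documents for the given URLs from Redis, following each
   * document's children links and de-duplicating by source URL.
   */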
  async getCachedDocuments(urls: string[]): Promise<Document[]> {
    let documents: Document[] = [];
    for (const url of urls) {
      const normalizedUrl = this.normalizeUrl(url);
      console.log(
        "Getting cached document for web-scraper-cache:" + normalizedUrl
      );
      const cachedDocumentString = await getValue(
        "web-scraper-cache:" + normalizedUrl
      );
      if (cachedDocumentString) {
        const cachedDocument = JSON.parse(cachedDocumentString);
        documents.push(cachedDocument);

        // Get children documents
        for (const childUrl of cachedDocument.childrenLinks || []) {
          const normalizedChildUrl = this.normalizeUrl(childUrl);
          const childCachedDocumentString = await getValue(
            "web-scraper-cache:" + normalizedChildUrl
          );
          if (childCachedDocumentString) {
            const childCachedDocument = JSON.parse(childCachedDocumentString);
            if (
              !documents.find(
                (doc) =>
                  doc.metadata.sourceURL ===
                  childCachedDocument.metadata.sourceURL
              )
            ) {
              documents.push(childCachedDocument);
            }
          }
        }
      }
    }
    return documents;
  }

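  /**
   * Applies WebScraperOptions, filling in defaults for crawler and page
   * options and prefixing https:// to URLs that lack a scheme.
   */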
  setOptions(options: WebScraperOptions): void {
    if (!options.urls) {
      throw new Error("Urls are required");
    }

    this.bullJobId = options.bullJobId;
    this.urls = options.urls;
    this.mode = options.mode;
    this.concurrentRequests = options.concurrentRequests ?? 20;
    this.includes = options.crawlerOptions?.includes ?? [];
    this.excludes = options.crawlerOptions?.excludes ?? [];
    this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
    this.maxCrawledDepth = options.crawlerOptions?.maxDepth ?? 10;
    this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
    this.limit = options.crawlerOptions?.limit ?? 10000;
    this.generateImgAltText =
      options.crawlerOptions?.generateImgAltText ?? false;
    this.pageOptions = options.pageOptions ?? {
      onlyMainContent: false,
      includeHtml: false,
      replaceAllPathsWithAbsolutePaths: false,
      parsePDF: true,
      removeTags: [],
    };
    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
    this.replaceAllPathsWithAbsolutePaths =
      options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
      options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
      false;
    //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
    this.excludes = this.excludes.filter((item) => item !== "");
    this.crawlerMode = options.crawlerOptions?.mode ?? "default";
    this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
    this.allowBackwardCrawling =
      options.crawlerOptions?.allowBackwardCrawling ?? false;

    // Make sure all urls start with https://
    this.urls = this.urls.map((url) => {
      if (!url.trim().startsWith("http")) {
        return `https://${url}`;
      }
      return url;
    });
  }

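  /**
   * Looks up each document in the site's sitemap and, when found, attaches the
   * sitemap changefreq, priority, and lastmod values to its metadata.
   */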
  private async getSitemapData(baseUrl: string, documents: Document[]) {
    const sitemapData = await fetchSitemapData(baseUrl);
    if (sitemapData) {
      for (let i = 0; i < documents.length; i++) {
        const docInSitemapData = sitemapData.find(
          (data) =>
            this.normalizeUrl(data.loc) ===
            this.normalizeUrl(documents[i].metadata.sourceURL)
        );
        if (docInSitemapData) {
          let sitemapDocData: Partial<SitemapEntry> = {};
          if (docInSitemapData.changefreq) {
            sitemapDocData.changefreq = docInSitemapData.changefreq;
          }
          if (docInSitemapData.priority) {
            sitemapDocData.priority = Number(docInSitemapData.priority);
          }
          if (docInSitemapData.lastmod) {
            sitemapDocData.lastmod = docInSitemapData.lastmod;
          }
          if (Object.keys(sitemapDocData).length !== 0) {
            documents[i].metadata.sitemap = sitemapDocData;
          }
        }
      }
    }
    return documents;
  }

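  /**
   * Finds markdown images in each document and, when an image has no alt text,
   * generates one via getImageDescription using the configured model, then
   * rewrites the image markdown in place.
   */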
  generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
    await Promise.all(
      documents.map(async (document) => {
        const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];

        await Promise.all(
          images.map(async (image: string) => {
            let imageUrl = image.match(/\(([^)]+)\)/)[1];
            let altText = image.match(/\[(.*?)\]/)[1];

            if (
              !altText &&
              !imageUrl.startsWith("data:image") &&
              /\.(png|jpeg|gif|webp)$/.test(imageUrl)
            ) {
              const imageIndex = document.content.indexOf(image);
              const contentLength = document.content.length;
              let backText = document.content.substring(
                imageIndex + image.length,
                Math.min(imageIndex + image.length + 1000, contentLength)
              );
              let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
              let frontText = document.content.substring(
                frontTextStartIndex,
                imageIndex
              );
              altText = await getImageDescription(
                imageUrl,
                backText,
                frontText,
                this.generateImgAltTextModel
              );
            }

            // Rewrite the image markdown with the (possibly generated) alt text.
            document.content = document.content.replace(
              image,
              `![${altText}](${imageUrl})`
            );
          })
        );
      })
    );

    return documents;
  };

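  /**
   * Drops documents whose URL depth exceeds the configured maxCrawledDepth.
   */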
  filterDepth(documents: Document[]): Document[] {
    return documents.filter((document) => {
      const url = new URL(document.metadata.sourceURL);
      return getURLDepth(url.toString()) <= this.maxCrawledDepth;
    });
  }
}