firecrawl/apps/api/src/scraper/WebScraper/index.ts

import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
import { Progress } from "../../lib/entities";
import { scrapSingleUrl } from "./single_url";
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
import { WebCrawler } from "./crawler";
import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/imageDescription";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
import { generateCompletions } from "../../lib/LLM-extraction";


/**
 * Produces `Document`s for a set of URLs in one of three modes:
 * scraping the given URLs directly (`single_urls`), scraping every link of a
 * sitemap (`sitemap`), or crawling from the first URL (`crawl`).
 *
 * Configure an instance with {@link setOptions} and then call
 * {@link getDocuments}. Crawl and sitemap results are written to the Redis
 * backed cache under `web-scraper-cache:<normalized url>`.
 */
export class WebScraperDataProvider {
  private urls: string[] = [""];
  private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
  // The next four defaults mirror the fallbacks applied in setOptions so the
  // fields are never undefined, even before setOptions has been called.
  private includes: string[] = [];
  private excludes: string[] = [];
  private maxCrawledLinks: number = 1000;
  private returnOnlyUrls: boolean = false;
  private limit: number = 10000;
  private concurrentRequests: number = 20;
  private generateImgAltText: boolean = false;
  private pageOptions?: PageOptions;
  private extractorOptions?: ExtractorOptions;
  private replaceAllPathsWithAbsolutePaths?: boolean = false;
  private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
  private includeHtml: boolean = false;

  /** Not implemented; always throws. */
  authorize(): void {
    throw new Error("Method not implemented.");
  }

  /** Not implemented; always throws (synchronously, not as a rejected promise). */
  authorizeNango(): Promise<void> {
    throw new Error("Method not implemented.");
  }

  /**
   * Scrapes `urls` in batches of `concurrentRequests`, reporting progress
   * after each URL finishes.
   *
   * @param urls URLs to scrape.
   * @param inProgress Optional callback invoked once per scraped URL.
   * @returns The scraped documents in the same order as `urls`, with `null`
   * results removed.
   */
  private async convertUrlsToDocuments(
    urls: string[],
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    const totalUrls = urls.length;
    // A batch size below 1 would never advance the loop index, and a
    // fractional one would produce fractional result indices.
    const batchSize =
      this.concurrentRequests >= 1 ? Math.floor(this.concurrentRequests) : 1;
    let processedUrls = 0;

    const results: (Document | null)[] = new Array(totalUrls).fill(null);
    for (let i = 0; i < totalUrls; i += batchSize) {
      const batchUrls = urls.slice(i, i + batchSize);
      await Promise.all(
        batchUrls.map(async (url, index) => {
          const result = await scrapSingleUrl(url, this.pageOptions, this.includeHtml);
          processedUrls++;
          if (inProgress) {
            inProgress({
              current: processedUrls,
              total: totalUrls,
              status: "SCRAPING",
              currentDocumentUrl: url,
              currentDocument: result
            });
          }
          // Write by absolute index so output order matches input order even
          // though requests inside a batch finish in arbitrary order.
          results[i + index] = result;
        })
      );
    }
    return results.filter((result): result is Document => result !== null);
  }

  /**
   * Entry point: returns documents for the configured URLs and mode.
   *
   * @param useCaching When true, cached documents are read first and fresh
   * documents are only fetched if fewer than `limit` were found.
   * @param inProgress Optional progress callback.
   * @returns The resulting documents.
   * @throws Error("Url is required") (as a rejection) when the first URL is
   * missing or blank.
   */
  async getDocuments(
    useCaching: boolean = false,
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    this.validateInitialUrl();

    if (!useCaching) {
      return this.processDocumentsWithoutCache(inProgress);
    }

    return this.processDocumentsWithCache(inProgress);
  }

  /**
   * Ensures there is a usable first URL.
   *
   * @throws Error("Url is required") when `urls` is empty or its first entry
   * is blank.
   */
  private validateInitialUrl(): void {
    const firstUrl = this.urls[0];
    // Also covers an empty `urls` array, where urls[0] is undefined.
    if (typeof firstUrl !== "string" || firstUrl.trim() === "") {
      throw new Error("Url is required");
    }
  }

  /**
   * Dispatches to the handler for the configured mode, bypassing cache reads.
   *
   * @param inProgress Optional progress callback.
   * @returns The documents produced by the mode handler, or an empty array for
   * an unrecognised mode.
   */
  private async processDocumentsWithoutCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    switch (this.mode) {
      case "crawl":
        return this.handleCrawlMode(inProgress);
      case "single_urls":
        return this.handleSingleUrlsMode(inProgress);
      case "sitemap":
        return this.handleSitemapMode(inProgress);
      default:
        return [];
    }
  }

  /**
   * Crawls from the first URL and scrapes the discovered links (or returns
   * URL-only placeholder documents when `returnOnlyUrls` is set).
   */
  private async handleCrawlMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    const crawler = new WebCrawler({
      initialUrl: this.urls[0],
      includes: this.includes,
      excludes: this.excludes,
      maxCrawledLinks: this.maxCrawledLinks,
      limit: this.limit,
      generateImgAltText: this.generateImgAltText,
      includeHtml: this.includeHtml,
    });
    const links = await crawler.start(inProgress, 5, this.limit);
    if (this.returnOnlyUrls) {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }

    const documents = await this.processLinks(links, inProgress);
    return this.cacheAndFinalizeDocuments(documents, links);
  }

  /** Scrapes exactly the configured URLs; results are not written to the cache. */
  private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    return this.processLinks(this.urls, inProgress);
  }

  /**
   * Scrapes every link listed in the sitemap of the first URL (or returns
   * URL-only placeholder documents when `returnOnlyUrls` is set).
   */
  private async handleSitemapMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    const links = await getLinksFromSitemap(this.urls[0]);
    if (this.returnOnlyUrls) {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }

    const documents = await this.processLinks(links, inProgress);
    return this.cacheAndFinalizeDocuments(documents, links);
  }

  /**
   * Reports completion and returns one empty-content document per link,
   * carrying only the link as `metadata.sourceURL`.
   */
  private async returnOnlyUrlsResponse(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {
    inProgress?.({
      current: links.length,
      total: links.length,
      status: "COMPLETED",
      currentDocumentUrl: this.urls[0],
    });
    return links.map(url => ({
      content: "",
      html: this.includeHtml ? "" : undefined,
      markdown: "",
      metadata: { sourceURL: url },
    }));
  }

  /**
   * Turns links into documents: links ending in `.pdf` go through the PDF
   * processor, everything else is scraped and then enriched with sitemap
   * data, absolute paths and (optionally) generated image alt text. LLM
   * extraction runs only in `single_urls` mode.
   *
   * @param links Links to process.
   * @param inProgress Optional progress callback (used for scraped pages only).
   * @returns Scraped documents followed by PDF documents.
   */
  private async processLinks(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {
    const pdfLinks: string[] = [];
    const pageLinks: string[] = [];
    for (const link of links) {
      (link.endsWith(".pdf") ? pdfLinks : pageLinks).push(link);
    }
    const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);

    let documents = await this.convertUrlsToDocuments(pageLinks, inProgress);
    documents = await this.getSitemapData(this.urls[0], documents);
    documents = this.applyPathReplacements(documents);
    documents = await this.applyImgAltText(documents);

    // extractorOptions is optional and only populated by setOptions.
    if (this.extractorOptions?.mode === "llm-extraction" && this.mode === "single_urls") {
      documents = await generateCompletions(
        documents,
        this.extractorOptions
      );
    }
    return documents.concat(pdfDocuments);
  }

  /** Fetches and processes every PDF link concurrently into a document. */
  private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
    return Promise.all(pdfLinks.map(async pdfLink => {
      const pdfContent = await fetchAndProcessPdf(pdfLink);
      return {
        content: pdfContent,
        metadata: { sourceURL: pdfLink },
        provider: "web-scraper"
      };
    }));
  }

  /**
   * Rewrites relative paths to absolute ones: all paths when
   * `replaceAllPathsWithAbsolutePaths` is set, otherwise image paths only.
   */
  private applyPathReplacements(documents: Document[]): Document[] {
    return this.replaceAllPathsWithAbsolutePaths ? replacePathsWithAbsolutePaths(documents) : replaceImgPathsWithAbsolutePaths(documents);
  }

  /** Generates missing image alt text when `generateImgAltText` is enabled. */
  private async applyImgAltText(documents: Document[]): Promise<Document[]> {
    return this.generateImgAltText ? this.generatesImgAltText(documents) : documents;
  }

  /**
   * Caches the documents (with `links` stored as their children), strips
   * `childrenLinks` from them and returns at most `limit` documents.
   */
  private async cacheAndFinalizeDocuments(documents: Document[], links: string[]): Promise<Document[]> {
    await this.setCachedDocuments(documents, links);
    documents = this.removeChildLinks(documents);
    // slice, not splice: take the first `limit` without mutating the input.
    return documents.slice(0, this.limit);
  }

  /**
   * Reads cached documents for the configured URLs and, when fewer than
   * `limit` were found, fetches fresh documents and merges in those not
   * already present. The merged list is filtered by the include/exclude
   * patterns and capped at `limit`.
   */
  private async processDocumentsWithCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit));
    if (documents.length < this.limit) {
      const newDocuments: Document[] = await this.getDocuments(false, inProgress);
      documents = this.mergeNewDocuments(documents, newDocuments);
    }
    documents = this.filterDocsExcludeInclude(documents);
    documents = this.removeChildLinks(documents);
    return documents.slice(0, this.limit);
  }

  /**
   * Appends to `existingDocuments` every new document whose normalized source
   * URL is not already present. Documents without a source URL cannot be
   * de-duplicated and are appended as-is.
   *
   * @returns `existingDocuments`, which is mutated in place.
   */
  private mergeNewDocuments(existingDocuments: Document[], newDocuments: Document[]): Document[] {
    const seenUrls = new Set<string>();
    for (const existing of existingDocuments) {
      const sourceURL = existing.metadata?.sourceURL;
      if (sourceURL) {
        seenUrls.add(this.normalizeUrl(sourceURL));
      }
    }

    for (const doc of newDocuments) {
      const sourceURL = doc.metadata?.sourceURL;
      if (!sourceURL) {
        existingDocuments.push(doc);
        continue;
      }
      const key = this.normalizeUrl(sourceURL);
      if (!seenUrls.has(key)) {
        // Record the key so duplicates inside newDocuments are dropped too.
        seenUrls.add(key);
        existingDocuments.push(doc);
      }
    }
    return existingDocuments;
  }

  /**
   * Returns the pathname of a document's source URL, or `undefined` when the
   * URL is missing or cannot be parsed.
   */
  private sourcePathOf(document: Document): string | undefined {
    try {
      return new URL(document.metadata?.sourceURL).pathname;
    } catch {
      return undefined;
    }
  }

  /**
   * Filters documents by matching the pathname of their source URL against
   * the configured exclude and include regular expressions.
   *
   * A document is dropped when any exclude pattern matches; when include
   * patterns exist it is kept only if at least one matches. Empty-string
   * patterns are ignored. A document whose URL cannot be parsed matches no
   * pattern: it is never excluded, and is kept only when there are no include
   * patterns.
   *
   * @throws SyntaxError when a configured pattern is not a valid RegExp.
   */
  private filterDocsExcludeInclude(documents: Document[]): Document[] {
    // Compile once per call instead of once per document and pattern.
    const compile = (patterns: string[] | undefined): RegExp[] =>
      (patterns ?? [])
        .filter((pattern) => pattern !== "")
        .map((pattern) => new RegExp(pattern));
    const excludePatterns = compile(this.excludes);
    const includePatterns = compile(this.includes);

    return documents.filter((document) => {
      const path = this.sourcePathOf(document);
      if (path === undefined) {
        return includePatterns.length === 0;
      }
      if (excludePatterns.some((pattern) => pattern.test(path))) {
        return false;
      }
      return (
        includePatterns.length === 0 ||
        includePatterns.some((pattern) => pattern.test(path))
      );
    });
  }

  /**
   * Normalizes a URL for comparison and cache keys by removing the first
   * occurrence of `//www.` (replaced with `//`).
   */
  private normalizeUrl(url: string): string {
    if (url.includes("//www.")) {
      return url.replace("//www.", "//");
    }
    return url;
  }

  /** Deletes the `childrenLinks` property from each document, in place. */
  private removeChildLinks(documents: Document[]): Document[] {
    for (const document of documents) {
      if (document?.childrenLinks) delete document.childrenLinks;
    }
    return documents;
  }

  /**
   * Writes each document with non-blank content to the cache for 10 days,
   * keyed by its normalized source URL and stored together with
   * `childrenLinks`. Documents without content or source URL are skipped.
   *
   * @param documents Documents to cache.
   * @param childrenLinks Links stored alongside every cached document;
   * defaults to an empty list.
   */
  async setCachedDocuments(documents: Document[], childrenLinks?: string[]) {
    for (const document of documents) {
      const sourceURL = document.metadata?.sourceURL;
      // Nothing useful to cache, or no key to cache it under.
      if (!sourceURL || !document.content || document.content.trim().length === 0) {
        continue;
      }
      const normalizedUrl = this.normalizeUrl(sourceURL);
      await setValue(
        "web-scraper-cache:" + normalizedUrl,
        JSON.stringify({
          ...document,
          childrenLinks: childrenLinks || [],
        }),
        60 * 60 * 24 * 10
      ); // 10 days
    }
  }

  /**
   * Reads cached documents for `urls` and, for every hit, also the cached
   * documents of its `childrenLinks`. A child is skipped when a document with
   * the same `sourceURL` has already been collected.
   *
   * @param urls URLs to look up.
   * @returns The cached documents that were found, parents before their
   * children.
   */
  async getCachedDocuments(urls: string[]): Promise<Document[]> {
    const documents: Document[] = [];
    for (const url of urls) {
      const normalizedUrl = this.normalizeUrl(url);
      console.log(
        "Getting cached document for web-scraper-cache:" + normalizedUrl
      );
      const cachedDocumentString = await getValue(
        "web-scraper-cache:" + normalizedUrl
      );
      if (!cachedDocumentString) {
        continue;
      }
      const cachedDocument = JSON.parse(cachedDocumentString);
      documents.push(cachedDocument);

      // get children documents
      for (const childUrl of (cachedDocument.childrenLinks || [])) {
        const normalizedChildUrl = this.normalizeUrl(childUrl);
        const childCachedDocumentString = await getValue(
          "web-scraper-cache:" + normalizedChildUrl
        );
        if (!childCachedDocumentString) {
          continue;
        }
        const childCachedDocument = JSON.parse(childCachedDocumentString);
        const alreadyCollected = documents.some(
          (doc) =>
            doc.metadata.sourceURL === childCachedDocument.metadata.sourceURL
        );
        if (!alreadyCollected) {
          documents.push(childCachedDocument);
        }
      }
    }
    return documents;
  }

  /**
   * Applies scraper options, falling back to defaults for anything omitted.
   *
   * URLs are trimmed and prefixed with `https://` when they do not already
   * start with `http://` or `https://`. Blank URLs are left empty so that
   * {@link getDocuments} rejects them with "Url is required".
   *
   * @param options Scraper configuration; `urls` is mandatory.
   * @throws Error("Urls are required") when `options.urls` is missing.
   */
  setOptions(options: WebScraperOptions): void {
    if (!options.urls) {
      throw new Error("Urls are required");
    }

    this.mode = options.mode;
    this.concurrentRequests = options.concurrentRequests ?? 20;
    this.includes = options.crawlerOptions?.includes ?? [];
    this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
    this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
    this.limit = options.crawlerOptions?.limit ?? 10000;
    this.generateImgAltText =
      options.crawlerOptions?.generateImgAltText ?? false;
    this.pageOptions = options.pageOptions ?? { onlyMainContent: false };
    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
    this.includeHtml = options?.includeHtml ?? false;
    //! @nicolas: an empty-string exclude pattern was being injected from
    //! somewhere and breaking everything. The source has not been tracked
    //! down yet, so empty patterns are stripped here.
    this.excludes = (options.crawlerOptions?.excludes ?? []).filter(
      (item) => item !== ""
    );

    // Make sure every non-blank URL carries an http(s) scheme. The check
    // requires the full "http(s)://" prefix so hosts that merely begin with
    // "http" (e.g. "httpbin.org") still get one.
    this.urls = options.urls.map((url) => {
      const trimmed = url.trim();
      if (trimmed === "" || /^https?:\/\//i.test(trimmed)) {
        return trimmed;
      }
      return `https://${trimmed}`;
    });
  }

  /**
   * Attaches sitemap metadata (`changefreq`, `priority`, `lastmod`) to every
   * document whose normalized source URL matches a sitemap entry's `loc`.
   *
   * @param baseUrl URL passed to `fetchSitemapData`.
   * @param documents Documents to enrich; mutated in place.
   * @returns The same `documents` array.
   */
  private async getSitemapData(baseUrl: string, documents: Document[]) {
    const sitemapData = await fetchSitemapData(baseUrl);
    if (!sitemapData) {
      return documents;
    }

    for (const document of documents) {
      const docUrl = this.normalizeUrl(document.metadata.sourceURL);
      const entry = sitemapData.find(
        (data) => this.normalizeUrl(data.loc) === docUrl
      );
      if (!entry) {
        continue;
      }

      const sitemapDocData: Partial<SitemapEntry> = {};
      if (entry.changefreq) {
        sitemapDocData.changefreq = entry.changefreq;
      }
      if (entry.priority) {
        sitemapDocData.priority = Number(entry.priority);
      }
      if (entry.lastmod) {
        sitemapDocData.lastmod = entry.lastmod;
      }
      // Only attach the sitemap block when at least one field was present.
      if (Object.keys(sitemapDocData).length !== 0) {
        document.metadata.sitemap = sitemapDocData;
      }
    }
    return documents;
  }

  /**
   * Fills in alt text for markdown images that have none.
   *
   * For every `![](url)` whose URL is not a data URI and ends in
   * `.png`, `.jpeg`, `.gif` or `.webp`, a description is requested from
   * `getImageDescription` using up to 1000 characters of text on either side
   * of the image as context. Images that already have alt text, or have an
   * empty URL, are left untouched.
   *
   * @param documents Documents whose `content` is updated in place.
   * @returns The same `documents` array.
   */
  generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
    await Promise.all(
      documents.map(async (document) => {
        const images: string[] =
          document.content.match(/!\[.*?\]\((.*?)\)/g) || [];

        await Promise.all(
          images.map(async (image: string) => {
            // Parse alt text and URL with the same pattern that found the
            // image, so parentheses inside the alt text are never mistaken
            // for the URL and an empty URL cannot cause a null dereference.
            const parts = image.match(/!\[(.*?)\]\((.*?)\)/);
            if (!parts) {
              return;
            }
            const [, existingAltText, imageUrl] = parts;

            if (
              existingAltText ||
              !imageUrl ||
              imageUrl.startsWith("data:image") ||
              !/\.(png|jpeg|gif|webp)$/.test(imageUrl)
            ) {
              return;
            }

            const imageIndex = document.content.indexOf(image);
            const imageEnd = imageIndex + image.length;
            const backText = document.content.substring(
              imageEnd,
              Math.min(imageEnd + 1000, document.content.length)
            );
            const frontText = document.content.substring(
              Math.max(imageIndex - 1000, 0),
              imageIndex
            );
            const altText = await getImageDescription(
              imageUrl,
              backText,
              frontText,
              this.generateImgAltTextModel
            );
            if (!altText) {
              return;
            }

            // Function replacer: "$&", "$'" etc. in the generated text must be
            // inserted literally rather than treated as replacement patterns.
            document.content = document.content.replace(
              image,
              () => `![${altText}](${imageUrl})`
            );
          })
        );
      })
    );

    return documents;
  };
}
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";`
Initial commit 2024-04-15 17:01:47 -04:00			`import { Progress } from "../../lib/entities";`
			`import { scrapSingleUrl } from "./single_url";`
			`import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";`
			`import { WebCrawler } from "./crawler";`
			`import { getValue, setValue } from "../../services/redis";`
Added anthropic vision to getImageDescription function 2024-04-16 18:03:48 -03:00			`import { getImageDescription } from "./utils/imageDescription";`
[Feat] Adding pdf parser 2024-04-18 11:43:57 -03:00			`import { fetchAndProcessPdf } from "./utils/pdfProcessor";`
adding option to replace all relative paths with absolute paths 2024-04-19 11:47:20 -03:00			`import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";`
Caleb: first test passing 2024-04-28 17:38:20 -07:00			`import { generateCompletions } from "../../lib/LLM-extraction";`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00
Initial commit 2024-04-15 17:01:47 -04:00
			`export class WebScraperDataProvider {`
			`private urls: string[] = [""];`
			`private mode: "single_urls" \| "sitemap" \| "crawl" = "single_urls";`
			`private includes: string[];`
			`private excludes: string[];`
			`private maxCrawledLinks: number;`
			`private returnOnlyUrls: boolean;`
			`private limit: number = 10000;`
			`private concurrentRequests: number = 20;`
Nick: 2024-04-16 12:49:14 -04:00			`private generateImgAltText: boolean = false;`
Nick: 2024-04-17 18:24:46 -07:00			`private pageOptions?: PageOptions;`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`private extractorOptions?: ExtractorOptions;`
adding option to replace all relative paths with absolute paths 2024-04-19 11:47:20 -03:00			`private replaceAllPathsWithAbsolutePaths?: boolean = false;`
Update index.ts 2024-04-17 10:39:00 -07:00			`private generateImgAltTextModel: "gpt-4-turbo" \| "claude-3-opus" = "gpt-4-turbo";`
changed to `includeHtml` 2024-05-06 19:45:56 -03:00			`private includeHtml: boolean = false;`
Initial commit 2024-04-15 17:01:47 -04:00
			`authorize(): void {`
			`throw new Error("Method not implemented.");`
			`}`

			`authorizeNango(): Promise<void> {`
			`throw new Error("Method not implemented.");`
			`}`

			`private async convertUrlsToDocuments(`
			`urls: string[],`
			`inProgress?: (progress: Progress) => void`
			`): Promise<Document[]> {`
			`const totalUrls = urls.length;`
			`let processedUrls = 0;`
Caleb: trying to get loggin workng 2024-04-30 09:20:15 -07:00
Initial commit 2024-04-15 17:01:47 -04:00			`const results: (Document \| null)[] = new Array(urls.length).fill(null);`
			`for (let i = 0; i < urls.length; i += this.concurrentRequests) {`
			`const batchUrls = urls.slice(i, i + this.concurrentRequests);`
Update index.ts 2024-04-17 12:51:12 -07:00			`await Promise.all(`
			`batchUrls.map(async (url, index) => {`
changed to `includeHtml` 2024-05-06 19:45:56 -03:00			`const result = await scrapSingleUrl(url, this.pageOptions, this.includeHtml);`
Update index.ts 2024-04-17 12:51:12 -07:00			`processedUrls++;`
			`if (inProgress) {`
			`inProgress({`
			`current: processedUrls,`
			`total: totalUrls,`
			`status: "SCRAPING",`
			`currentDocumentUrl: url,`
Nick: partial docs working, cleaner 2024-05-04 12:30:12 -07:00			`currentDocument: result`
Update index.ts 2024-04-17 12:51:12 -07:00			`});`
			`}`
			`results[i + index] = result;`
			`})`
			`);`
Initial commit 2024-04-15 17:01:47 -04:00			`}`
			`return results.filter((result) => result !== null) as Document[];`
			`}`

			`async getDocuments(`
			`useCaching: boolean = false,`
			`inProgress?: (progress: Progress) => void`
			`): Promise<Document[]> {`
Update index.ts 2024-05-04 11:53:16 -07:00			`this.validateInitialUrl();`
Initial commit 2024-04-15 17:01:47 -04:00
Update index.ts 2024-04-18 13:53:11 -03:00			`if (!useCaching) {`
Update index.ts 2024-05-04 11:53:16 -07:00			`return this.processDocumentsWithoutCache(inProgress);`
			`}`
adding option to replace all relative paths with absolute paths 2024-04-19 11:47:20 -03:00
Update index.ts 2024-05-04 11:53:16 -07:00			`return this.processDocumentsWithCache(inProgress);`
			`}`
Initial commit 2024-04-15 17:01:47 -04:00
Update index.ts 2024-05-04 11:53:16 -07:00			`private validateInitialUrl(): void {`
			`if (this.urls[0].trim() === "") {`
			`throw new Error("Url is required");`
			`}`
			`}`
Initial commit 2024-04-15 17:01:47 -04:00
Update index.ts 2024-05-04 12:44:30 -07:00			`/**`
			`* Process documents without cache handling each mode`
			`* @param inProgress inProgress`
			`* @returns documents`
			`*/`
Update index.ts 2024-05-04 11:53:16 -07:00			`private async processDocumentsWithoutCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {`
			`switch (this.mode) {`
			`case "crawl":`
			`return this.handleCrawlMode(inProgress);`
			`case "single_urls":`
			`return this.handleSingleUrlsMode(inProgress);`
			`case "sitemap":`
			`return this.handleSitemapMode(inProgress);`
			`default:`
			`return [];`
			`}`
			`}`
Initial commit 2024-04-15 17:01:47 -04:00
Update index.ts 2024-05-04 11:53:16 -07:00			`private async handleCrawlMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {`
			`const crawler = new WebCrawler({`
			`initialUrl: this.urls[0],`
			`includes: this.includes,`
			`excludes: this.excludes,`
			`maxCrawledLinks: this.maxCrawledLinks,`
			`limit: this.limit,`
			`generateImgAltText: this.generateImgAltText,`
changed to `includeHtml` 2024-05-06 19:45:56 -03:00			`includeHtml: this.includeHtml,`
Update index.ts 2024-05-04 11:53:16 -07:00			`});`
			`let links = await crawler.start(inProgress, 5, this.limit);`
			`if (this.returnOnlyUrls) {`
			`return this.returnOnlyUrlsResponse(links, inProgress);`
			`}`
[Feat] Adding pdf parser 2024-04-18 11:43:57 -03:00
Update index.ts 2024-05-04 11:53:16 -07:00			`let documents = await this.processLinks(links, inProgress);`
			`return this.cacheAndFinalizeDocuments(documents, links);`
			`}`
adding option to replace all relative paths with absolute paths 2024-04-19 11:47:20 -03:00
Update index.ts 2024-05-04 11:53:16 -07:00			`private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {`
Nick: partial docs working, cleaner 2024-05-04 12:30:12 -07:00			`let documents = await this.processLinks(this.urls, inProgress);`
Update index.ts 2024-05-04 11:53:16 -07:00			`return documents;`
			`}`
adding option to replace all relative paths with absolute paths 2024-04-19 11:47:20 -03:00
Update index.ts 2024-05-04 11:53:16 -07:00			`private async handleSitemapMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {`
			`let links = await getLinksFromSitemap(this.urls[0]);`
			`if (this.returnOnlyUrls) {`
			`return this.returnOnlyUrlsResponse(links, inProgress);`
			`}`
Update index.ts 2024-04-17 12:51:12 -07:00
Update index.ts 2024-05-04 11:53:16 -07:00			`let documents = await this.processLinks(links, inProgress);`
			`return this.cacheAndFinalizeDocuments(documents, links);`
			`}`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00
Update index.ts 2024-05-04 11:53:16 -07:00			`private async returnOnlyUrlsResponse(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {`
			`inProgress?.({`
			`current: links.length,`
			`total: links.length,`
			`status: "COMPLETED",`
			`currentDocumentUrl: this.urls[0],`
			`});`
			`return links.map(url => ({`
			`content: "",`
changed to `includeHtml` 2024-05-06 19:45:56 -03:00			`html: this.includeHtml ? "" : undefined,`
Update index.ts 2024-05-04 11:53:16 -07:00			`markdown: "",`
			`metadata: { sourceURL: url },`
			`}));`
			`}`
[Feat] Adding pdf parser 2024-04-18 11:43:57 -03:00
Update index.ts 2024-05-04 11:53:16 -07:00			`private async processLinks(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {`
			`let pdfLinks = links.filter(link => link.endsWith(".pdf"));`
			`let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);`
			`links = links.filter(link => !link.endsWith(".pdf"));`
Initial commit 2024-04-15 17:01:47 -04:00
Update index.ts 2024-05-04 11:53:16 -07:00			`let documents = await this.convertUrlsToDocuments(links, inProgress);`
			`documents = await this.getSitemapData(this.urls[0], documents);`
			`documents = this.applyPathReplacements(documents);`
			`documents = await this.applyImgAltText(documents);`
Nick: partial docs working, cleaner 2024-05-04 12:30:12 -07:00
			`if(this.extractorOptions.mode === "llm-extraction" && this.mode === "single_urls") {`
			`documents = await generateCompletions(`
			`documents,`
			`this.extractorOptions`
			`)`
			`}`
Update index.ts 2024-05-04 11:53:16 -07:00			`return documents.concat(pdfDocuments);`
			`}`
adding option to replace all relative paths with absolute paths 2024-04-19 11:47:20 -03:00
Update index.ts 2024-05-04 11:53:16 -07:00			`private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {`
			`return Promise.all(pdfLinks.map(async pdfLink => {`
			`const pdfContent = await fetchAndProcessPdf(pdfLink);`
			`return {`
			`content: pdfContent,`
			`metadata: { sourceURL: pdfLink },`
			`provider: "web-scraper"`
			`};`
			`}));`
			`}`
adding option to replace all relative paths with absolute paths 2024-04-19 11:47:20 -03:00
Update index.ts 2024-05-04 11:53:16 -07:00			`private applyPathReplacements(documents: Document[]): Document[] {`
			`return this.replaceAllPathsWithAbsolutePaths ? replacePathsWithAbsolutePaths(documents) : replaceImgPathsWithAbsolutePaths(documents);`
			`}`
Update index.ts 2024-04-17 12:51:12 -07:00
Update index.ts 2024-05-04 11:53:16 -07:00			`private async applyImgAltText(documents: Document[]): Promise<Document[]> {`
			`return this.generateImgAltText ? this.generatesImgAltText(documents) : documents;`
			`}`
Initial commit 2024-04-15 17:01:47 -04:00
Update index.ts 2024-05-04 11:53:16 -07:00			`private async cacheAndFinalizeDocuments(documents: Document[], links: string[]): Promise<Document[]> {`
			`await this.setCachedDocuments(documents, links);`
			`documents = this.removeChildLinks(documents);`
			`return documents.splice(0, this.limit);`
			`}`
Initial commit 2024-04-15 17:01:47 -04:00
Update index.ts 2024-05-04 11:53:16 -07:00			`private async processDocumentsWithCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {`
			`let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit));`
Initial commit 2024-04-15 17:01:47 -04:00			`if (documents.length < this.limit) {`
Update index.ts 2024-05-04 11:53:16 -07:00			`const newDocuments: Document[] = await this.getDocuments(false, inProgress);`
			`documents = this.mergeNewDocuments(documents, newDocuments);`
Initial commit 2024-04-15 17:01:47 -04:00			`}`
			`documents = this.filterDocsExcludeInclude(documents);`
			`documents = this.removeChildLinks(documents);`
Update index.ts 2024-05-04 11:53:16 -07:00			`return documents.splice(0, this.limit);`
			`}`

			`private mergeNewDocuments(existingDocuments: Document[], newDocuments: Document[]): Document[] {`
			`newDocuments.forEach(doc => {`
			`if (!existingDocuments.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) {`
			`existingDocuments.push(doc);`
			`}`
			`});`
			`return existingDocuments;`
Initial commit 2024-04-15 17:01:47 -04:00			`}`

			`private filterDocsExcludeInclude(documents: Document[]): Document[] {`
			`return documents.filter((document) => {`
			`const url = new URL(document.metadata.sourceURL);`
			`const path = url.pathname;`

Update index.ts 2024-04-17 12:51:12 -07:00			`if (this.excludes.length > 0 && this.excludes[0] !== "") {`
Initial commit 2024-04-15 17:01:47 -04:00			`// Check if the link should be excluded`
Update index.ts 2024-04-17 12:51:12 -07:00			`if (`
			`this.excludes.some((excludePattern) =>`
			`new RegExp(excludePattern).test(path)`
			`)`
			`) {`
Initial commit 2024-04-15 17:01:47 -04:00			`return false;`
			`}`
			`}`
Update index.ts 2024-04-17 12:51:12 -07:00
			`if (this.includes.length > 0 && this.includes[0] !== "") {`
Initial commit 2024-04-15 17:01:47 -04:00			`// Check if the link matches the include patterns, if any are specified`
			`if (this.includes.length > 0) {`
Update index.ts 2024-04-17 12:51:12 -07:00			`return this.includes.some((includePattern) =>`
			`new RegExp(includePattern).test(path)`
			`);`
Initial commit 2024-04-15 17:01:47 -04:00			`}`
			`}`
			`return true;`
			`});`
			`}`

			`private normalizeUrl(url: string): string {`
			`if (url.includes("//www.")) {`
			`return url.replace("//www.", "//");`
			`}`
			`return url;`
			`}`

			`private removeChildLinks(documents: Document[]): Document[] {`
			`for (let document of documents) {`
			`if (document?.childrenLinks) delete document.childrenLinks;`
Update index.ts 2024-04-17 12:51:12 -07:00			`}`
Initial commit 2024-04-15 17:01:47 -04:00			`return documents;`
			`}`

			`async setCachedDocuments(documents: Document[], childrenLinks?: string[]) {`
			`for (const document of documents) {`
			`if (document.content.trim().length === 0) {`
			`continue;`
			`}`
			`const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);`
Update index.ts 2024-04-17 12:51:12 -07:00			`await setValue(`
			`"web-scraper-cache:" + normalizedUrl,`
			`JSON.stringify({`
			`...document,`
			`childrenLinks: childrenLinks \|\| [],`
			`}),`
			`60 * 60 * 24 * 10`
			`); // 10 days`
Initial commit 2024-04-15 17:01:47 -04:00			`}`
			`}`

			`async getCachedDocuments(urls: string[]): Promise<Document[]> {`
			`let documents: Document[] = [];`
			`for (const url of urls) {`
			`const normalizedUrl = this.normalizeUrl(url);`
Update index.ts 2024-04-17 12:51:12 -07:00			`console.log(`
			`"Getting cached document for web-scraper-cache:" + normalizedUrl`
			`);`
			`const cachedDocumentString = await getValue(`
			`"web-scraper-cache:" + normalizedUrl`
			`);`
Initial commit 2024-04-15 17:01:47 -04:00			`if (cachedDocumentString) {`
			`const cachedDocument = JSON.parse(cachedDocumentString);`
			`documents.push(cachedDocument);`

			`// get children documents`
Nick: partial docs working, cleaner 2024-05-04 12:30:12 -07:00			`for (const childUrl of (cachedDocument.childrenLinks \|\| [])) {`
Initial commit 2024-04-15 17:01:47 -04:00			`const normalizedChildUrl = this.normalizeUrl(childUrl);`
Update index.ts 2024-04-17 12:51:12 -07:00			`const childCachedDocumentString = await getValue(`
			`"web-scraper-cache:" + normalizedChildUrl`
			`);`
Initial commit 2024-04-15 17:01:47 -04:00			`if (childCachedDocumentString) {`
			`const childCachedDocument = JSON.parse(childCachedDocumentString);`
Update index.ts 2024-04-17 12:51:12 -07:00			`if (`
			`!documents.find(`
			`(doc) =>`
			`doc.metadata.sourceURL ===`
			`childCachedDocument.metadata.sourceURL`
			`)`
			`) {`
Initial commit 2024-04-15 17:01:47 -04:00			`documents.push(childCachedDocument);`
			`}`
			`}`
			`}`
			`}`
			`}`
			`return documents;`
			`}`

			`setOptions(options: WebScraperOptions): void {`
			`if (!options.urls) {`
			`throw new Error("Urls are required");`
			`}`

			`this.urls = options.urls;`
			`this.mode = options.mode;`
			`this.concurrentRequests = options.concurrentRequests ?? 20;`
			`this.includes = options.crawlerOptions?.includes ?? [];`
			`this.excludes = options.crawlerOptions?.excludes ?? [];`
			`this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;`
			`this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;`
			`this.limit = options.crawlerOptions?.limit ?? 10000;`
Update index.ts 2024-04-17 12:51:12 -07:00			`this.generateImgAltText =`
			`options.crawlerOptions?.generateImgAltText ?? false;`
changed to `includeHtml` 2024-05-06 19:45:56 -03:00			`this.pageOptions = options.pageOptions ?? {onlyMainContent: false };`
Caleb: got it to a testable state I believe 2024-04-28 15:52:09 -07:00			`this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}`
adding option to replace all relative paths with absolute paths 2024-04-19 11:47:20 -03:00			`this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;`
changed to `includeHtml` 2024-05-06 19:45:56 -03:00			`this.includeHtml = options?.includeHtml ?? false;`
Initial commit 2024-04-15 17:01:47 -04:00			`//! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check`
Update index.ts 2024-04-17 12:51:12 -07:00			`this.excludes = this.excludes.filter((item) => item !== "");`

Initial commit 2024-04-15 17:01:47 -04:00			`// make sure all urls start with https://`
			`this.urls = this.urls.map((url) => {`
			`if (!url.trim().startsWith("http")) {`
			return `https://${url}`;
			`}`
			`return url;`
			`});`
			`}`

			`private async getSitemapData(baseUrl: string, documents: Document[]) {`
Update index.ts 2024-04-17 12:51:12 -07:00			`const sitemapData = await fetchSitemapData(baseUrl);`
Initial commit 2024-04-15 17:01:47 -04:00			`if (sitemapData) {`
			`for (let i = 0; i < documents.length; i++) {`
Update index.ts 2024-04-17 12:51:12 -07:00			`const docInSitemapData = sitemapData.find(`
			`(data) =>`
			`this.normalizeUrl(data.loc) ===`
			`this.normalizeUrl(documents[i].metadata.sourceURL)`
			`);`
Initial commit 2024-04-15 17:01:47 -04:00			`if (docInSitemapData) {`
			`let sitemapDocData: Partial<SitemapEntry> = {};`
			`if (docInSitemapData.changefreq) {`
			`sitemapDocData.changefreq = docInSitemapData.changefreq;`
			`}`
			`if (docInSitemapData.priority) {`
			`sitemapDocData.priority = Number(docInSitemapData.priority);`
			`}`
			`if (docInSitemapData.lastmod) {`
			`sitemapDocData.lastmod = docInSitemapData.lastmod;`
			`}`
			`if (Object.keys(sitemapDocData).length !== 0) {`
			`documents[i].metadata.sitemap = sitemapDocData;`
			`}`
			`}`
			`}`
			`}`
			`return documents;`
			`}`
Nick: 2024-04-16 12:49:14 -04:00			`generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {`
Update index.ts 2024-04-17 12:51:12 -07:00			`await Promise.all(`
			`documents.map(async (document) => {`
			`const images = document.content.match(/!\[.?\]\((.?)\)/g) \|\| [];`
Nick: 2024-04-16 12:49:14 -04:00
Update index.ts 2024-04-17 12:51:12 -07:00			`await Promise.all(`
			`images.map(async (image: string) => {`
			`let imageUrl = image.match(/\(([^)]+)\)/)[1];`
			`let altText = image.match(/\[(.*?)\]/)[1];`

			`if (`
			`!altText &&`
			`!imageUrl.startsWith("data:image") &&`
			`/\.(png\|jpeg\|gif\|webp)$/.test(imageUrl)`
			`) {`
			`const imageIndex = document.content.indexOf(image);`
			`const contentLength = document.content.length;`
			`let backText = document.content.substring(`
			`imageIndex + image.length,`
			`Math.min(imageIndex + image.length + 1000, contentLength)`
			`);`
			`let frontTextStartIndex = Math.max(imageIndex - 1000, 0);`
			`let frontText = document.content.substring(`
			`frontTextStartIndex,`
			`imageIndex`
			`);`
			`altText = await getImageDescription(`
			`imageUrl,`
			`backText,`
			`frontText`
Resolved merge conflicts between feat/added-anthropic-vision-api and main 2024-04-24 09:57:45 -03:00			`, this.generateImgAltTextModel);`
Update index.ts 2024-04-17 12:51:12 -07:00			`}`

			`document.content = document.content.replace(`
			`image,`
			`![${altText}](${imageUrl})`
			`);`
			`})`
			`);`
			`})`
			`);`
Nick: 2024-04-16 12:49:14 -04:00
			`return documents;`
Update index.ts 2024-04-17 12:51:12 -07:00			`};`
			`}`