import { Request, Response } from "express";
import { Logger } from "../../lib/logger";
import { checkAndUpdateURL } from "../../lib/validateUrl";
import { RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";

export async function scrapeController(req: RequestWithAuth<ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
  req.body = scrapeRequestSchema.parse(req.body);
  Logger.debug(`scrape request body: ${JSON.stringify(req.body)}`); // use the project logger rather than console.log

  // TODO: validate req.body beyond the schema parse above
  // mockup req.body:
  // req.body = {
  //   url: "test",
  //   headers: {
  //     "x-key": "test"
  //   },
  //   formats: ["markdown", "html", "rawHtml", "content", "linksOnPage", "screenshot", "fullPageScreenshot"],
  //   includeTags: ["test"],
  //   excludeTags: ["test"],
  //   onlyMainContent: false,
  //   timeout: 30000,
  //   waitFor: 123 // example value; waitFor takes a number (milliseconds)
  // }

  let earlyReturn = false;

  // make sure to authenticate the user first (Authorization: Bearer <token>)
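  // A minimal sketch of the Bearer check that TODO describes. It is
  // illustrative only: the RequestWithAuth type suggests auth middleware
  // has already populated req.auth before this controller runs.
  // const authHeader = req.headers.authorization ?? "";
  // if (!authHeader.startsWith("Bearer ")) {
  //   return res.status(401).json({ success: false, error: "Unauthorized" });
  // }
  // const token = authHeader.slice("Bearer ".length).trim();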

  // check credits
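  // A sketch of the credit gate this refers to, mirroring the commented-out
  // checkCredits() helper further down (checkTeamCredits and team_id are
  // borrowed from that reference code, not defined in this file yet):
  // const { success: creditsCheckSuccess } = await checkTeamCredits(team_id, 1);
  // if (!creditsCheckSuccess) {
  //   return res.status(402).json({ error: "Insufficient credits" });
  // }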

  // mock response for now; the real implementation is commented out below
  const result: ScrapeResponse = {
    success: true,
    warning: "test",
    data: {
      markdown: "test",
      html: "test",
      rawHtml: "test",
      links: ["test1", "test2"],
      screenshot: "test",
      metadata: {
        title: "test",
        description: "test",
        language: "test",
        sourceURL: "test",
        statusCode: 200,
        error: "test"
      }
    }
  };

  return res.status(200).json(result);

  // Everything below this early return is the previous implementation,
  // kept commented out for reference.

  // const crawlerOptions = req.body.crawlerOptions ?? {};
  // const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
  // const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
  // const origin = req.body.origin ?? defaultOrigin;
  // let timeout = req.body.timeout ?? defaultTimeout;

  // if (extractorOptions.mode.includes("llm-extraction")) {
  //   pageOptions.onlyMainContent = true;
  //   timeout = req.body.timeout ?? 90000;
  // }

  // const checkCredits = async () => {
  //   try {
  //     const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
  //     if (!creditsCheckSuccess) {
  //       earlyReturn = true;
  //       return res.status(402).json({ error: "Insufficient credits" });
  //     }
  //   } catch (error) {
  //     Logger.error(error);
  //     earlyReturn = true;
  //     return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
  //   }
  // };

  // await checkCredits();

  // const jobId = uuidv4();

  // const startTime = new Date().getTime();
  // const result = await scrapeHelper(
  //   jobId,
  //   req,
  //   team_id,
  //   crawlerOptions,
  //   pageOptions,
  //   extractorOptions,
  //   timeout,
  //   plan
  // );
  // const endTime = new Date().getTime();
  // const timeTakenInSeconds = (endTime - startTime) / 1000;
  // const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;

  // if (result.success) {
  //   let creditsToBeBilled = 1; // Assuming 1 credit per document
  //   const creditsPerLLMExtract = 50;

  //   if (extractorOptions.mode.includes("llm-extraction")) {
  //     // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
  //     creditsToBeBilled += creditsPerLLMExtract;
  //   }

  //   let startTimeBilling = new Date().getTime();

  //   if (earlyReturn) {
  //     // Don't bill if we're early returning
  //     return;
  //   }
  //   const billingResult = await billTeam(
  //     team_id,
  //     creditsToBeBilled
  //   );
  //   if (!billingResult.success) {
  //     return res.status(402).json({
  //       success: false,
  //       error: "Failed to bill team. Insufficient credits or subscription not found.",
  //     });
  //   }
  // }

  // logJob({
  //   job_id: jobId,
  //   success: result.success,
  //   message: result.error,
  //   num_docs: 1,
  //   docs: [result.data],
  //   time_taken: timeTakenInSeconds,
  //   team_id: team_id,
  //   mode: "scrape",
  //   url: req.body.url,
  //   crawlerOptions: crawlerOptions,
  //   pageOptions: pageOptions,
  //   origin: origin,
  //   extractor_options: extractorOptions,
  //   num_tokens: numTokens,
  // });

  // return res.status(result.returnCode).json(result);
}

// export async function scrapeHelper(
//   jobId: string,
//   req: Request,
//   team_id: string,
//   crawlerOptions: any,
//   pageOptions: PageOptions,
//   extractorOptions: ExtractorOptions,
//   timeout: number,
//   plan?: string
// ): Promise<{
//   success: boolean;
//   error?: string;
//   data?: Document;
//   returnCode: number;
// }> {
//   const url = req.body.url;
//   if (!url) {
//     return { success: false, error: "Url is required", returnCode: 400 };
//   }

//   if (isUrlBlocked(url)) {
//     return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
//   }

//   const a = new WebScraperDataProvider();
//   await a.setOptions({
//     jobId,
//     mode: "single_urls",
//     urls: [url],
//     crawlerOptions: {
//       ...crawlerOptions,
//     },
//     pageOptions: pageOptions,
//     extractorOptions: extractorOptions,
//   });

//   // Race the scrape against a timer so a hung page can't hold the request open.
//   const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
//     setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
//   );

//   const docsPromise = a.getDocuments(false);

//   let docs;
//   try {
//     docs = await Promise.race([docsPromise, timeoutPromise]);
//   } catch (error) {
//     // the timeout rejects with a response-shaped object, so it is returned as-is
//     return error;
//   }
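
//   // A reusable variant of the race-with-timeout pattern above: an
//   // illustrative sketch, not part of the original implementation.
//   // const withTimeout = <T>(p: Promise<T>, ms: number): Promise<T> =>
//   //   Promise.race([
//   //     p,
//   //     new Promise<never>((_, reject) =>
//   //       setTimeout(() => reject({ success: false, error: "Request timed out", returnCode: 408 }), ms)
//   //     ),
//   //   ]);
//   // usage: docs = await withTimeout(a.getDocuments(false), timeout);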

//   // make sure doc.content is not empty
//   let filteredDocs = docs.filter(
//     (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
//   );
//   if (filteredDocs.length === 0) {
//     return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
//   }

//   // Remove rawHtml if pageOptions.includeRawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
//   if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
//     filteredDocs.forEach(doc => {
//       delete doc.rawHtml;
//     });
//   }

//   return {
//     success: true,
//     data: filteredDocs[0],
//     returnCode: 200,
//   };
// }
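
// Example of wiring this controller into an Express router, shown commented
// like the reference code above. The path and router name are assumptions
// for illustration; the actual route registration lives elsewhere in the repo:
// import express from "express";
// const v1Router = express.Router();
// v1Router.post("/v1/scrape", scrapeController);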