2024-04-28 13:59:35 -07:00
|
|
|
import OpenAI from 'openai'
|
|
|
|
import { z } from 'zod'
|
2024-04-28 15:52:09 -07:00
|
|
|
import { Document, ExtractorOptions } from "../../lib/entities";
|
2024-04-30 09:20:15 -07:00
|
|
|
import { numTokensFromString } from './helpers';
|
2024-04-28 15:52:09 -07:00
|
|
|
|
2024-04-28 13:59:35 -07:00
|
|
|
// import {
|
|
|
|
// LlamaModel,
|
|
|
|
// LlamaJsonSchemaGrammar,
|
|
|
|
// LlamaContext,
|
|
|
|
// LlamaChatSession,
|
|
|
|
// GbnfJsonSchema,
|
|
|
|
// } from 'node-llama-cpp'
|
2024-04-28 15:52:09 -07:00
|
|
|
// import { JsonSchema7Type } from 'zod-to-json-schema'
|
2024-04-28 13:59:35 -07:00
|
|
|
|
|
|
|
// Result shape for a single scraper LLM-completion run: the extracted data
// plus the URL it was extracted from.
// NOTE(review): the type parameter T is currently unused — `any | null`
// collapses to `any`, so `data` is untyped. Presumably this was meant to be
// `z.infer<T> | null`; confirm against callers before tightening, since they
// may rely on the loose typing.
export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
  data: any | null
  url: string
}
|
|
|
|
|
|
|
|
// System prompt used by generateOpenAICompletions when the caller does not
// supply one.
const defaultPrompt =
  'You are a professional web scraper. Extract the contents of the webpage'
|
2024-04-28 13:59:35 -07:00
|
|
|
|
2024-04-28 15:52:09 -07:00
|
|
|
function prepareOpenAIDoc(
|
|
|
|
document: Document
|
2024-04-28 13:59:35 -07:00
|
|
|
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
|
2024-04-28 15:52:09 -07:00
|
|
|
|
|
|
|
// Check if the markdown content exists in the document
|
|
|
|
if (!document.markdown) {
|
|
|
|
throw new Error("Markdown content is missing in the document.");
|
2024-04-28 13:59:35 -07:00
|
|
|
}
|
|
|
|
|
2024-04-30 10:23:12 -07:00
|
|
|
return [{ type: 'text', text: document.markdown}]
|
2024-04-28 13:59:35 -07:00
|
|
|
}
|
|
|
|
|
2024-04-29 12:12:55 -07:00
|
|
|
export async function generateOpenAICompletions({
|
2024-04-28 15:52:09 -07:00
|
|
|
client,
|
2024-04-30 09:20:15 -07:00
|
|
|
model = 'gpt-4-turbo',
|
2024-04-28 15:52:09 -07:00
|
|
|
document,
|
|
|
|
schema, //TODO - add zod dynamic type checking
|
|
|
|
prompt = defaultPrompt,
|
|
|
|
temperature
|
|
|
|
}: {
|
2024-04-28 13:59:35 -07:00
|
|
|
client: OpenAI,
|
2024-04-28 15:52:09 -07:00
|
|
|
model?: string,
|
|
|
|
document: Document,
|
|
|
|
schema: any, // This should be replaced with a proper Zod schema type when available
|
|
|
|
prompt?: string,
|
2024-04-28 13:59:35 -07:00
|
|
|
temperature?: number
|
2024-04-28 15:52:09 -07:00
|
|
|
}): Promise<Document> {
|
2024-04-28 13:59:35 -07:00
|
|
|
const openai = client as OpenAI
|
2024-04-28 15:52:09 -07:00
|
|
|
const content = prepareOpenAIDoc(document)
|
2024-04-28 13:59:35 -07:00
|
|
|
|
2024-04-30 09:20:15 -07:00
|
|
|
|
2024-04-28 13:59:35 -07:00
|
|
|
const completion = await openai.chat.completions.create({
|
|
|
|
model,
|
|
|
|
messages: [
|
|
|
|
{
|
|
|
|
role: 'system',
|
|
|
|
content: prompt,
|
|
|
|
},
|
|
|
|
{ role: 'user', content },
|
|
|
|
],
|
|
|
|
tools: [
|
|
|
|
{
|
|
|
|
type: 'function',
|
|
|
|
function: {
|
|
|
|
name: 'extract_content',
|
|
|
|
description: 'Extracts the content from the given webpage(s)',
|
|
|
|
parameters: schema,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
],
|
|
|
|
tool_choice: 'auto',
|
|
|
|
temperature,
|
|
|
|
})
|
|
|
|
|
|
|
|
const c = completion.choices[0].message.tool_calls[0].function.arguments
|
2024-04-28 15:52:09 -07:00
|
|
|
|
|
|
|
// Extract the LLM extraction content from the completion response
|
2024-04-28 19:28:28 -07:00
|
|
|
const llmExtraction = JSON.parse(c);
|
|
|
|
|
2024-04-30 09:20:15 -07:00
|
|
|
// console.log("llm extraction: ", llmExtraction);
|
|
|
|
|
2024-04-28 15:52:09 -07:00
|
|
|
|
|
|
|
// Return the document with the LLM extraction content added
|
2024-04-28 13:59:35 -07:00
|
|
|
return {
|
2024-04-28 15:52:09 -07:00
|
|
|
...document,
|
|
|
|
llm_extraction: llmExtraction
|
|
|
|
};
|
|
|
|
|
2024-04-28 13:59:35 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
|
|
|
|
// model: LlamaModel,
|
|
|
|
// page: ScraperLoadResult,
|
|
|
|
// schema: JsonSchema7Type,
|
|
|
|
// prompt: string = defaultPrompt,
|
|
|
|
// temperature?: number
|
|
|
|
// ): Promise<ScraperCompletionResult<T>> {
|
|
|
|
// const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
|
|
|
|
// const context = new LlamaContext({ model })
|
|
|
|
// const session = new LlamaChatSession({ context })
|
|
|
|
// const pagePrompt = `${prompt}\n${page.content}`
|
|
|
|
|
|
|
|
// const result = await session.prompt(pagePrompt, {
|
|
|
|
// grammar,
|
|
|
|
// temperature,
|
|
|
|
// })
|
|
|
|
|
|
|
|
// const parsed = grammar.parse(result)
|
|
|
|
// return {
|
|
|
|
// data: parsed,
|
|
|
|
// url: page.url,
|
|
|
|
// }
|
|
|
|
// }
|