2024-04-28 13:59:35 -07:00
|
|
|
import OpenAI from 'openai'
|
|
|
|
import { z } from 'zod'
|
2024-04-28 15:52:09 -07:00
|
|
|
import { Document, ExtractorOptions } from "../../lib/entities";
|
2024-04-30 09:20:15 -07:00
|
|
|
import { numTokensFromString } from './helpers';
|
2024-04-28 15:52:09 -07:00
|
|
|
|
2024-04-28 13:59:35 -07:00
|
|
|
// import {
|
|
|
|
// LlamaModel,
|
|
|
|
// LlamaJsonSchemaGrammar,
|
|
|
|
// LlamaContext,
|
|
|
|
// LlamaChatSession,
|
|
|
|
// GbnfJsonSchema,
|
|
|
|
// } from 'node-llama-cpp'
|
2024-04-28 15:52:09 -07:00
|
|
|
// import { JsonSchema7Type } from 'zod-to-json-schema'
|
2024-04-28 13:59:35 -07:00
|
|
|
|
|
|
|
// Result shape for a single scraper LLM-completion run: the extracted data
// plus the URL it was extracted from.
// NOTE(review): the type parameter T is currently unused — `any | null`
// collapses to `any`, so `data` is untyped. Presumably this was meant to be
// `z.infer<T> | null`; confirm against callers before tightening, since they
// may rely on the loose typing.
export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
  data: any | null
  url: string
}
|
|
|
|
|
|
|
|
// System prompt used by generateOpenAICompletions when the caller does not
// supply one.
const defaultPrompt =
  'You are a professional web scraper. Extract the contents of the webpage'
|
2024-04-28 13:59:35 -07:00
|
|
|
|
2024-04-28 15:52:09 -07:00
|
|
|
function prepareOpenAIDoc(
|
|
|
|
document: Document
|
2024-04-28 13:59:35 -07:00
|
|
|
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
|
2024-04-28 15:52:09 -07:00
|
|
|
|
|
|
|
// Check if the markdown content exists in the document
|
|
|
|
if (!document.markdown) {
|
|
|
|
throw new Error("Markdown content is missing in the document.");
|
2024-04-28 13:59:35 -07:00
|
|
|
}
|
|
|
|
|
2024-04-30 10:23:12 -07:00
|
|
|
return [{ type: 'text', text: document.markdown}]
|
2024-04-28 13:59:35 -07:00
|
|
|
}
|
|
|
|
|
2024-04-29 12:12:55 -07:00
|
|
|
export async function generateOpenAICompletions({
|
2024-04-28 15:52:09 -07:00
|
|
|
client,
|
2024-04-30 09:20:15 -07:00
|
|
|
model = 'gpt-4-turbo',
|
2024-04-28 15:52:09 -07:00
|
|
|
document,
|
|
|
|
schema, //TODO - add zod dynamic type checking
|
|
|
|
prompt = defaultPrompt,
|
|
|
|
temperature
|
|
|
|
}: {
|
2024-04-28 13:59:35 -07:00
|
|
|
client: OpenAI,
|
2024-04-28 15:52:09 -07:00
|
|
|
model?: string,
|
|
|
|
document: Document,
|
|
|
|
schema: any, // This should be replaced with a proper Zod schema type when available
|
|
|
|
prompt?: string,
|
2024-04-28 13:59:35 -07:00
|
|
|
temperature?: number
|
2024-04-28 15:52:09 -07:00
|
|
|
}): Promise<Document> {
|
2024-04-28 13:59:35 -07:00
|
|
|
const openai = client as OpenAI
|
2024-04-28 15:52:09 -07:00
|
|
|
const content = prepareOpenAIDoc(document)
|
2024-04-28 13:59:35 -07:00
|
|
|
|
2024-04-30 09:20:15 -07:00
|
|
|
|
2024-04-28 13:59:35 -07:00
|
|
|
const completion = await openai.chat.completions.create({
|
|
|
|
model,
|
|
|
|
messages: [
|
|
|
|
{
|
|
|
|
role: 'system',
|
|
|
|
content: prompt,
|
|
|
|
},
|
|
|
|
{ role: 'user', content },
|
|
|
|
],
|
|
|
|
tools: [
|
|
|
|
{
|
|
|
|
type: 'function',
|
|
|
|
function: {
|
|
|
|
name: 'extract_content',
|
|
|
|
description: 'Extracts the content from the given webpage(s)',
|
|
|
|
parameters: schema,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
],
|
|
|
|
tool_choice: 'auto',
|
|
|
|
temperature,
|
|
|
|
})
|
|
|
|
|
|
|
|
const c = completion.choices[0].message.tool_calls[0].function.arguments
|
2024-04-28 15:52:09 -07:00
|
|
|
|
|
|
|
// Extract the LLM extraction content from the completion response
|
2024-04-28 19:28:28 -07:00
|
|
|
const llmExtraction = JSON.parse(c);
|
|
|
|
|
2024-04-30 09:20:15 -07:00
|
|
|
// console.log("llm extraction: ", llmExtraction);
|
|
|
|
|
2024-04-28 15:52:09 -07:00
|
|
|
|
|
|
|
// Return the document with the LLM extraction content added
|
2024-04-28 13:59:35 -07:00
|
|
|
return {
|
2024-04-28 15:52:09 -07:00
|
|
|
...document,
|
|
|
|
llm_extraction: llmExtraction
|
|
|
|
};
|
|
|
|
|
2024-04-28 13:59:35 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
|
|
|
|
// model: LlamaModel,
|
|
|
|
// page: ScraperLoadResult,
|
|
|
|
// schema: JsonSchema7Type,
|
|
|
|
// prompt: string = defaultPrompt,
|
|
|
|
// temperature?: number
|
|
|
|
// ): Promise<ScraperCompletionResult<T>> {
|
|
|
|
// const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
|
|
|
|
// const context = new LlamaContext({ model })
|
|
|
|
// const session = new LlamaChatSession({ context })
|
|
|
|
// const pagePrompt = `${prompt}\n${page.content}`
|
|
|
|
|
|
|
|
// const result = await session.prompt(pagePrompt, {
|
|
|
|
// grammar,
|
|
|
|
// temperature,
|
|
|
|
// })
|
|
|
|
|
|
|
|
// const parsed = grammar.parse(result)
|
|
|
|
// return {
|
|
|
|
// data: parsed,
|
|
|
|
// url: page.url,
|
|
|
|
// }
|
|
|
|
// }
|