2024-04-30 12:19:43 -07:00
import OpenAI from "openai" ;
import { Document } from "../../lib/entities" ;
2024-04-28 15:52:09 -07:00
2024-04-30 12:19:43 -07:00
/**
 * Result of running a completion over a single scraped page.
 *
 * - `data`: the completion payload; its shape is not constrained by this type.
 * - `url`: URL string associated with the result.
 */
export type ScraperCompletionResult = {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- payload shape is not known here
  data: any | null;
  url: string;
};
2024-04-28 13:59:35 -07:00
/** System prompt used when the caller does not supply one. */
const defaultPrompt = "You are a professional web scraper. Extract the contents of the webpage";
2024-04-28 13:59:35 -07:00
2024-04-28 15:52:09 -07:00
/**
 * Builds the content parts of an OpenAI chat message from a document.
 *
 * @param document - Document whose `markdown` field supplies the message text.
 * @returns A one-element array containing a single text part with the markdown.
 * @throws Error when `document.markdown` is falsy (absent or empty).
 */
function prepareOpenAIDoc(
  document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
  const { markdown } = document;

  // Happy path first: any non-empty markdown becomes the sole text part.
  if (markdown) {
    return [{ type: "text", text: markdown }];
  }

  // Without markdown there is nothing to send to the model.
  throw new Error(
    "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
  );
}
2024-04-29 12:12:55 -07:00
/**
 * Runs a structured LLM extraction over a document using OpenAI tool calling.
 *
 * The document's markdown is sent as the user message, and `schema` is passed
 * verbatim as the `parameters` of a single `extract_content` function tool.
 * The JSON arguments the model produces for that tool are parsed and attached
 * to a copy of the document as `llm_extraction`.
 *
 * Options:
 * - `client`: OpenAI client used to issue the chat completion request.
 * - `model`: chat model name; defaults to `"gpt-4-turbo"`.
 * - `document`: document to extract from; it must carry markdown content.
 * - `schema`: forwarded unchanged as the tool's `parameters`.
 * - `prompt`: system prompt; defaults to `defaultPrompt`.
 * - `temperature`: forwarded unchanged to the request.
 *
 * @returns A shallow copy of `document` with `llm_extraction` set to the
 *   parsed tool-call arguments.
 * @throws Error when the document has no markdown, when the response contains
 *   no tool call, or when the tool-call arguments are not valid JSON.
 */
export async function generateOpenAICompletions({
  client,
  model = "gpt-4-turbo",
  document,
  schema, //TODO - add zod dynamic type checking
  prompt = defaultPrompt,
  temperature,
}: {
  client: OpenAI;
  model?: string;
  document: Document;
  schema: any; // This should be replaced with a proper Zod schema type when available
  prompt?: string;
  temperature?: number;
}): Promise<Document> {
  const content = prepareOpenAIDoc(document);

  const completion = await client.chat.completions.create({
    model,
    messages: [
      { role: "system", content: prompt },
      { role: "user", content },
    ],
    tools: [
      {
        type: "function",
        function: {
          name: "extract_content",
          description: "Extracts the content from the given webpage(s)",
          parameters: schema,
        },
      },
    ],
    // Force the extraction tool: the result is read exclusively from the tool
    // call, and with "auto" the model may reply in plain text instead.
    tool_choice: { type: "function", function: { name: "extract_content" } },
    temperature,
  });

  // Guard each level: a response may still come back with no choices or no
  // tool call, and a bare TypeError here would hide the actual cause.
  const toolCall = completion.choices[0]?.message?.tool_calls?.[0];
  if (!toolCall) {
    throw new Error(
      "LLM extraction failed: the model response did not contain an extract_content tool call"
    );
  }

  // Tool-call arguments are model-generated text and are not guaranteed to be
  // well-formed JSON (for example when the output is truncated).
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- shape is dictated by the caller-supplied schema
  let llmExtraction: any;
  try {
    llmExtraction = JSON.parse(toolCall.function.arguments);
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(
      `LLM extraction failed: tool call arguments are not valid JSON (${reason})`
    );
  }

  // Return the document with the LLM extraction content added
  return {
    ...document,
    llm_extraction: llmExtraction,
  };
}