2024-04-30 16:19:32 -07:00

77 lines
1.9 KiB
TypeScript

import OpenAI from "openai";
import { Document } from "../../lib/entities";
export type ScraperCompletionResult = {
data: any | null;
url: string;
};
const defaultPrompt =
"You are a professional web scraper. Extract the contents of the webpage";
function prepareOpenAIDoc(
document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
// Check if the markdown content exists in the document
if (!document.markdown) {
throw new Error(
"Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
);
}
return [{ type: "text", text: document.markdown }];
}
export async function generateOpenAICompletions({
client,
model = "gpt-4-turbo",
document,
schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt,
temperature,
}: {
client: OpenAI;
model?: string;
document: Document;
schema: any; // This should be replaced with a proper Zod schema type when available
prompt?: string;
temperature?: number;
}): Promise<Document> {
const openai = client as OpenAI;
const content = prepareOpenAIDoc(document);
const completion = await openai.chat.completions.create({
model,
messages: [
{
role: "system",
content: prompt,
},
{ role: "user", content },
],
tools: [
{
type: "function",
function: {
name: "extract_content",
description: "Extracts the content from the given webpage(s)",
parameters: schema,
},
},
],
tool_choice: "auto",
temperature,
});
const c = completion.choices[0].message.tool_calls[0].function.arguments;
// Extract the LLM extraction content from the completion response
const llmExtraction = JSON.parse(c);
// Return the document with the LLM extraction content added
return {
...document,
llm_extraction: llmExtraction,
};
}