// File-view export residue (not part of the module): 77 lines, 1.9 KiB, TypeScript.
// Blame timestamp for the lines below: 2024-04-30 12:19:43 -07:00

import OpenAI from "openai";
import { Document } from "../../lib/entities";
// Blame timestamp (file-view residue): 2024-04-30 12:19:43 -07:00
/**
 * Result record pairing a completion payload with a URL.
 */
export type ScraperCompletionResult = {
  /** URL this result belongs to (presumably the scraped page — confirm against callers). */
  url: string;
  /** Untyped completion payload, or `null`. */
  data: any | null;
};
/** Prompt text used as the default value of the `prompt` option in this module. */
const defaultPrompt =
  "You are a professional web scraper. Extract the contents of the webpage";
/**
 * Converts a document's markdown into a chat-completion content-part array.
 *
 * @param document - Document to convert; only its `markdown` field is read.
 * @returns A one-element array containing a single text part whose text is
 *   the document's markdown.
 * @throws Error when `document.markdown` is missing or empty.
 */
function prepareOpenAIDoc(
  document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
  // Falsy check: an empty string is rejected the same way as a missing field.
  if (!document.markdown) {
    throw new Error(
      "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
    );
  }

  return [{ type: "text", text: document.markdown }];
}
/**
 * Sends a document's markdown to an OpenAI chat model together with an
 * `extract_content` function tool and attaches the parsed tool-call arguments
 * to the document.
 *
 * @param client - OpenAI client used to issue the chat completion request.
 * @param model - Model identifier; defaults to `"gpt-4-turbo"`.
 * @param document - Document whose markdown is sent as the user message.
 * @param schema - Passed through unchanged as the tool's `parameters`
 *   (presumably a JSON Schema object — not validated here).
 * @param prompt - System message content; defaults to `defaultPrompt`.
 * @param temperature - Forwarded to the request as-is (may be undefined).
 * @returns A shallow copy of `document` with `llm_extraction` set to the
 *   JSON-parsed arguments of the first tool call.
 * @throws Error if the document has no markdown, if the response contains no
 *   tool call, or if the tool-call arguments are not valid JSON. Errors from
 *   the API request itself propagate unchanged.
 */
export async function generateOpenAICompletions({
  client,
  model = "gpt-4-turbo",
  document,
  schema, //TODO - add zod dynamic type checking
  prompt = defaultPrompt,
  temperature,
}: {
  client: OpenAI;
  model?: string;
  document: Document;
  schema: any; // This should be replaced with a proper Zod schema type when available
  prompt?: string;
  temperature?: number;
}): Promise<Document> {
  const content = prepareOpenAIDoc(document);

  const completion = await client.chat.completions.create({
    model,
    messages: [
      {
        role: "system",
        content: prompt,
      },
      { role: "user", content },
    ],
    tools: [
      {
        type: "function",
        function: {
          name: "extract_content",
          description: "Extracts the content from the given webpage(s)",
          parameters: schema,
        },
      },
    ],
    tool_choice: "auto",
    temperature,
  });

  // With tool_choice "auto" the model is allowed to answer in plain text, in
  // which case `tool_calls` is absent. Fail with a clear message instead of a
  // "cannot read properties of undefined" TypeError.
  const toolCall = completion.choices[0]?.message.tool_calls?.[0];
  if (!toolCall) {
    throw new Error(
      "LLM extraction failed: the model response did not include an extract_content tool call"
    );
  }

  try {
    // Return the document with the LLM extraction content added.
    return {
      ...document,
      llm_extraction: JSON.parse(toolCall.function.arguments),
    };
  } catch (error: unknown) {
    // Tool-call arguments are model-generated text and may be malformed or
    // truncated JSON; surface that with context rather than a bare SyntaxError.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(
      `LLM extraction failed: tool call arguments are not valid JSON (${reason})`
    );
  }
}