crawl4ai/docs/examples/llm_extraction_openai_pricing.py

from crawl4ai import LLMConfig
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
import asyncio
import os
import json
from pydantic import BaseModel, Field

url = "https://openai.com/api/pricing/"


class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(
        ..., description="Fee for output token for the OpenAI model."
    )

async def main():
    # Use AsyncWebCrawler
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
                llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="From the crawled content, extract all mentioned model names along with their "
                "fees for input and output tokens. Make sure not to miss anything in the entire content. "
                "One extracted model JSON format should look like this: "
                '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }',
            ),
        )
        print("Success:", result.success)
        model_fees = json.loads(result.extracted_content)
        print(len(model_fees))

        with open(".data/data.json", "w", encoding="utf-8") as f:
            f.write(result.extracted_content)


asyncio.run(main())