# crawl4ai/docs/examples/llm_extraction_openai_pricing.py

import os
import time
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *

# Page to crawl: the OpenAI API pricing page.
url = "https://openai.com/api/pricing/"

# Create the crawler and run its warmup step before issuing the real crawl.
crawler = WebCrawler()
crawler.warmup()

from pydantic import BaseModel, Field

# Shape of one extracted pricing entry. Documented with comments rather than a
# class docstring so the JSON schema produced by model_json_schema() stays
# exactly as before.
class OpenAIModelFee(BaseModel):
    # All three fields are required strings; the descriptions travel with the
    # generated schema.
    model_name: str = Field(
        ...,
        description="Name of the OpenAI model.",
    )
    input_fee: str = Field(
        ...,
        description="Fee for input token for the OpenAI model.",
    )
    output_fee: str = Field(
        ...,
        description="Fee for output token for the OpenAI model.",
    )

# Prompt handed to the extraction strategy; the final fragment shows the JSON
# shape wanted for a single model entry.
extraction_instruction = (
    "From the crawled content, extract all mentioned model names along with their "
    "fees for input and output tokens. Make sure not to miss anything in the entire content. "
    'One extracted model JSON format should look like this: '
    '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
)

# Schema-based LLM extraction using the Groq-hosted model. To use OpenAI
# instead, pass provider="openai/gpt-4o" and api_token=os.getenv('OPENAI_API_KEY').
llm_strategy = LLMExtractionStrategy(
    provider="groq/llama-3.1-70b-versatile",
    api_token=os.getenv("GROQ_API_KEY"),
    schema=OpenAIModelFee.model_json_schema(),
    extraction_type="schema",
    instruction=extraction_instruction,
)

# Run the crawl with the cache bypassed and the LLM strategy attached.
result = crawler.run(
    url=url,
    word_count_threshold=1,
    extraction_strategy=llm_strategy,
    bypass_cache=True,
)

# Import json explicitly: the script previously only worked if one of the
# wildcard imports at the top of the file happened to re-export it.
import json

# extracted_content is expected to hold JSON text; decode it to count the entries.
model_fees = json.loads(result.extracted_content)

print(len(model_fees))

# Save the raw extracted text. The output directory is created first so that
# open() does not raise FileNotFoundError when ".data" does not exist yet.
output_path = ".data/data.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(result.extracted_content)
chore: Update configuration values for chunk token threshold, overlap rate, and minimum word threshold. Create a new example for LLMExtraction Strategy, update Dockerfile, and README 2024-06-19 18:32:20 +08:00			`import os`
			`import time`
			`from crawl4ai.web_crawler import WebCrawler`
			`from crawl4ai.chunking_strategy import *`
			`from crawl4ai.extraction_strategy import *`
			`from crawl4ai.crawler_strategy import *`

			`url = r'https://openai.com/api/pricing/'`

			`crawler = WebCrawler()`
			`crawler.warmup()`

			`from pydantic import BaseModel, Field`

			`class OpenAIModelFee(BaseModel):`
			`model_name: str = Field(..., description="Name of the OpenAI model.")`
			`input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")`
			`output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")`

			`result = crawler.run(`
			`url=url,`
			`word_count_threshold=1,`
			`extraction_strategy= LLMExtractionStrategy(`
refactor: Update image description minimum word threshold in get_content_of_website_optimized 2024-08-02 15:55:32 +08:00			`# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),`
			`provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),`
chore: Update configuration values for chunk token threshold, overlap rate, and minimum word threshold. Create a new example for LLMExtraction Strategy, update Dockerfile, and README 2024-06-19 18:32:20 +08:00			`schema=OpenAIModelFee.model_json_schema(),`
			`extraction_type="schema",`
			`instruction="From the crawled content, extract all mentioned model names along with their "\`
			`"fees for input and output tokens. Make sure not to miss anything in the entire content. "\`
			`'One extracted model JSON format should look like this: '\`
			`'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'`
			`),`
			`bypass_cache=True,`
			`)`

			`model_fees = json.loads(result.extracted_content)`

			`print(len(model_fees))`

## [v0.2.74] - 2024-07-08 A slew of exciting updates to improve the crawler's stability and robustness! 🎉 - 💻 UTF encoding fix: Resolved the Windows \"charmap\" error by adding UTF encoding. - 🛡️ Error handling: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy. - 🧹 Input sanitization: Improved input sanitization and handled encoding issues in LLMExtractionStrategy. - 🚮 Database cleanup: Removed existing database file and initialized a new one. 2024-07-08 16:33:25 +08:00			`with open(".data/data.json", "w", encoding="utf-8") as f:`
chore: Update configuration values for chunk token threshold, overlap rate, and minimum word threshold. Create a new example for LLMExtraction Strategy, update Dockerfile, and README 2024-06-19 18:32:20 +08:00			`f.write(result.extracted_content)`