
Replace float('inf') and float('-inf') with math.inf and -math.inf from the math module for better readability and performance. Also clean up imports and remove unused speed comparison code. No breaking changes.
44 lines
1.7 KiB
Python
44 lines
1.7 KiB
Python
from crawl4ai.async_configs import LlmConfig
|
|
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
|
|
import asyncio
|
|
import os
|
|
import json
|
|
from pydantic import BaseModel, Field
|
|
|
|
# Target page handed to crawler.arun() in main().
url: str = "https://openai.com/api/pricing/"
|
|
|
|
|
|
class OpenAIModelFee(BaseModel):
    # Record shape for one extracted pricing entry. Its JSON schema is passed
    # to the LLM extraction strategy in main().
    # NOTE: documented with comments rather than a class docstring on purpose;
    # pydantic would copy a docstring into model_json_schema() and alter the
    # schema sent to the LLM.

    # All three fields are required (`...`) and kept as free-form strings.
    model_name: str = Field(
        ...,
        description="Name of the OpenAI model.",
    )
    input_fee: str = Field(
        ...,
        description="Fee for input token for the OpenAI model.",
    )
    output_fee: str = Field(
        ...,
        description="Fee for output token for the OpenAI model.",
    )
|
|
|
|
async def main():
    """Crawl the pricing page, extract model fees with an LLM, and save them.

    Runs ``AsyncWebCrawler`` against the module-level ``url`` using an
    ``LLMExtractionStrategy`` driven by the ``OpenAIModelFee`` JSON schema.
    Prints whether the crawl succeeded and how many entries were extracted,
    then writes the raw extracted JSON text to ``.data/data.json``.

    If the crawl fails or yields no extracted content, a message is printed
    and nothing is parsed or written.
    """
    extraction_strategy = LLMExtractionStrategy(
        # Alternative provider (OpenAI):
        #   provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
        llmConfig=LlmConfig(
            provider="groq/llama-3.1-70b-versatile",
            api_token=os.getenv("GROQ_API_KEY"),
        ),
        schema=OpenAIModelFee.model_json_schema(),
        extraction_type="schema",
        instruction="From the crawled content, extract all mentioned model names along with their "
        "fees for input and output tokens. Make sure not to miss anything in the entire content. "
        "One extracted model JSON format should look like this: "
        '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }',
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy=extraction_strategy,
        )
        print("Success:", result.success)

        extracted = result.extracted_content
        if not result.success or not extracted:
            # json.loads() raises on None or an empty string, so stop here
            # instead of crashing when the crawl produced nothing to parse.
            print("No extracted content; skipping JSON parsing and file output.")
            return

        model_fees = json.loads(extracted)
        print(len(model_fees))

        # open(..., "w") does not create parent directories; make sure the
        # output folder exists before writing.
        os.makedirs(".data", exist_ok=True)
        with open(".data/data.json", "w", encoding="utf-8") as f:
            f.write(extracted)
|
|
|
|
|
|
if __name__ == "__main__":
    # Start the crawl only when this file is executed as a script, so that
    # importing the module does not trigger network access.
    asyncio.run(main())
|