
Add new features to enhance browser automation and HTML extraction:
- Add CDP browser launch capability with customizable ports and profiles
- Implement JsonLxmlExtractionStrategy for faster HTML parsing
- Add CLI command 'crwl cdp' for launching standalone CDP browsers
- Support connecting to external CDP browsers via URL
- Optimize selector caching and context-sensitive queries

BREAKING CHANGE: LLMConfig import path changed from crawl4ai.types to crawl4ai
44 lines
1.7 KiB
Python
44 lines
1.7 KiB
Python
from crawl4ai import LLMConfig
|
|
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
|
|
import asyncio
|
|
import os
|
|
import json
|
|
from pydantic import BaseModel, Field
|
|
|
|
# Page to crawl; passed as the `url` argument to crawler.arun() in main().
url = "https://openai.com/api/pricing/"
|
|
|
|
|
|
class OpenAIModelFee(BaseModel):
    # Record shape for one extracted pricing entry. Its JSON schema is handed
    # to the LLM extraction strategy in main().
    #
    # NOTE: this is intentionally a comment and not a class docstring, because
    # pydantic copies a model's docstring into the generated JSON schema, which
    # would alter the schema this example sends along with the prompt.

    # All three fields are required strings (no default is supplied).
    model_name: str = Field(description="Name of the OpenAI model.")
    input_fee: str = Field(description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(description="Fee for output token for the OpenAI model.")
|
|
|
|
async def main():
    """Crawl the pricing page, extract model fees via an LLM, and save them.

    Steps:
      1. Build an ``LLMExtractionStrategy`` driven by the JSON schema of
         ``OpenAIModelFee`` and a natural-language instruction.
      2. Run ``AsyncWebCrawler.arun`` against the module-level ``url``.
      3. Print whether the crawl succeeded and how many items were extracted.
      4. Write the raw extracted JSON text to ``.data/data.json``.

    If the crawl produces no extracted content, a message is printed and the
    function returns without writing a file (previously this path crashed
    inside ``json.loads`` with a ``TypeError``).

    Raises:
        json.JSONDecodeError: If the extracted content is not valid JSON.
        OSError: If the output directory or file cannot be created/written.
    """
    output_path = ".data/data.json"

    # The provider is read from LLMConfig. To use OpenAI instead of Groq, set
    # provider="openai/gpt-4o" and api_token=os.getenv("OPENAI_API_KEY").
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="groq/llama-3.1-70b-versatile",
            api_token=os.getenv("GROQ_API_KEY"),
        ),
        schema=OpenAIModelFee.model_json_schema(),
        extraction_type="schema",
        instruction=(
            "From the crawled content, extract all mentioned model names along with their "
            "fees for input and output tokens. Make sure not to miss anything in the entire content. "
            "One extracted model JSON format should look like this: "
            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
        ),
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy=extraction_strategy,
        )
        print("Success:", result.success)

        # Guard: without extracted content there is nothing to parse or save,
        # and json.loads(None) would raise a confusing TypeError.
        if not result.extracted_content:
            print("No extracted content returned; nothing to save.")
            return

        model_fees = json.loads(result.extracted_content)
        print(len(model_fees))

        # Ensure the target directory exists; open(..., "w") does not create
        # missing parent directories and would raise FileNotFoundError.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(result.extracted_content)
|
|
|
|
|
|
# Script entry point: drive the main() coroutine to completion.
asyncio.run(main())
|