# crawl4ai/docs/examples/summarize_page.py

import os
import json
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *

# Page that this example crawls and summarizes.
url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"

# Create the crawler and call warmup() once before the first run() call.
crawler = WebCrawler()
crawler.warmup()

from pydantic import BaseModel, Field


# Shape of the structured summary requested for the crawled page. The JSON
# schema generated from this model (model_json_schema()) is what gets passed
# to LLMExtractionStrategy as `schema=`.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — pydantic copies a model docstring into the generated schema as
# "description", which would change the schema handed to the strategy.
class PageSummary(BaseModel):
    # Every field uses `...` as its default, i.e. it is required; each
    # `description` is carried into the generated JSON schema.
    title: str = Field(..., description="Title of the page.")
    summary: str = Field(..., description="Summary of the page.")
    brief_summary: str = Field(..., description="Brief summary of the page.")
    # Untyped list; the instruction text asks for a list of keyword strings.
    keywords: list = Field(..., description="Keywords assigned to the page.")


# Prompt fragments, kept one per requested field for readability; they are
# concatenated without a separator, so each fragment carries its own trailing
# space.
instruction_parts = (
    "From the crawled content, extract the following details: ",
    "1. Title of the page ",
    "2. Summary of the page, which is a detailed summary ",
    "3. Brief summary of the page, which is a paragraph text ",
    "4. Keywords assigned to the page, which is a list of keywords. ",
    "The extracted JSON format should look like this: ",
    '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }',
)

# Schema-driven LLM extraction: the PageSummary JSON schema describes the
# expected output and chunking is switched off. The API token is read from
# the OPENAI_API_KEY environment variable.
summary_strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o",
    api_token=os.getenv("OPENAI_API_KEY"),
    schema=PageSummary.model_json_schema(),
    extraction_type="schema",
    apply_chunking=False,
    instruction="".join(instruction_parts),
)

# Crawl the page with the strategy above, bypassing the cache.
result = crawler.run(
    url=url,
    word_count_threshold=1,
    extraction_strategy=summary_strategy,
    bypass_cache=True,
)

# Parse the extracted content (a JSON string) into Python objects; this also
# fails loudly if the extraction did not produce valid JSON.
page_summary = json.loads(result.extracted_content)

print(page_summary)

# Persist the raw extracted JSON. The output directory is created first,
# because open(..., "w") raises FileNotFoundError when ".data" does not exist
# (e.g. on a fresh checkout or when run from another working directory).
output_path = ".data/page_summary.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(result.extracted_content)