import os
import time
import json

from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *

# Target page: the Groqopilot extension listing on the VS Code Marketplace.
url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'

# Create the crawler and warm it up once before issuing requests.
crawler = WebCrawler()
crawler.warmup()

from pydantic import BaseModel, Field

# Pydantic schema describing the structured output the LLM should return.
class PageSummary(BaseModel):
    title: str = Field(..., description="Title of the page.")
    summary: str = Field(..., description="Summary of the page.")
    brief_summary: str = Field(..., description="Brief summary of the page.")
    keywords: list = Field(..., description="Keywords assigned to the page.")

# Crawl the page and extract a PageSummary via an LLM, constrained by the schema above.
result = crawler.run(
    url=url,
    word_count_threshold=1,
    extraction_strategy=LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv('OPENAI_API_KEY'),
        schema=PageSummary.model_json_schema(),
        extraction_type="schema",
        apply_chunking=False,
        instruction="From the crawled content, extract the following details: "
                    "1. Title of the page "
                    "2. Summary of the page, which is a detailed summary "
                    "3. Brief summary of the page, which is a paragraph text "
                    "4. Keywords assigned to the page, which is a list of keywords. "
                    'The extracted JSON format should look like this: '
                    '{ "title": "Page Title", "summary": "Detailed summary of the page.", '
                    '"brief_summary": "Brief summary in a paragraph.", '
                    '"keywords": ["keyword1", "keyword2", "keyword3"] }'
    ),
    bypass_cache=True,
)

# Parse the extracted JSON payload and show it.
page_summary = json.loads(result.extracted_content)

print(page_summary)

with open(".data/page_summary.json", "w") as f:
|
||
|
f.write(result.extracted_content)
|
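
# Optional sketch (not part of the original script): validate the parsed payload against
# the PageSummary model using pydantic v2's model_validate. Assumption: depending on the
# crawl4ai version, extracted_content may decode to a single dict or a list of block dicts,
# so both shapes are handled; items that do not match the schema are reported, not raised.
items = page_summary if isinstance(page_summary, list) else [page_summary]
for item in items:
    try:
        validated = PageSummary.model_validate(item)
        print(validated.title, "-", len(validated.keywords), "keywords")
    except Exception as exc:
        print("Item did not match the PageSummary schema:", exc)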