
A slew of exciting updates to improve the crawler's stability and robustness! 🎉

- 💻 **UTF encoding fix**: Resolved the Windows "charmap" error by adding UTF-8 encoding.
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy (see the sketch below).
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
- 🚮 **Database cleanup**: Removed the existing database file and initialized a new one.
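For context, `MaxRetryError` is the exception urllib3 raises when a connection keeps failing, which is what Selenium surfaces if the browser driver becomes unreachable. Below is a minimal, hypothetical sketch of the catch-and-retry pattern; `fetch_with_retry` and its parameters are illustrative, not the actual LocalSeleniumCrawlerStrategy API.

```python
# Hypothetical sketch of MaxRetryError handling; not the library's actual code.
from urllib3.exceptions import MaxRetryError


def fetch_with_retry(driver, url, max_attempts=3):
    """Retry a Selenium page load that fails at the connection layer."""
    for attempt in range(1, max_attempts + 1):
        try:
            driver.get(url)
            return driver.page_source
        except MaxRetryError as exc:
            # The driver connection is down; once retries are exhausted,
            # give callers a clear error instead of a raw urllib3 traceback.
            if attempt == max_attempts:
                raise RuntimeError(
                    f"Failed to load {url} after {max_attempts} attempts"
                ) from exc
```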
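The example below shows the strategies in action: it crawls a page and uses `LLMExtractionStrategy` with a Pydantic schema to extract a structured page summary.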
```python
import os
import json

from pydantic import BaseModel, Field

from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *

url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'

crawler = WebCrawler()
crawler.warmup()


# Schema describing the structured summary the LLM should return.
class PageSummary(BaseModel):
    title: str = Field(..., description="Title of the page.")
    summary: str = Field(..., description="Summary of the page.")
    brief_summary: str = Field(..., description="Brief summary of the page.")
    keywords: list = Field(..., description="Keywords assigned to the page.")


result = crawler.run(
    url=url,
    word_count_threshold=1,
    extraction_strategy=LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv('OPENAI_API_KEY'),
        schema=PageSummary.model_json_schema(),
        extraction_type="schema",
        apply_chunking=False,
        instruction="From the crawled content, extract the following details: "
                    "1. Title of the page "
                    "2. Summary of the page, which is a detailed summary "
                    "3. Brief summary of the page, which is a paragraph text "
                    "4. Keywords assigned to the page, which is a list of keywords. "
                    'The extracted JSON format should look like this: '
                    '{ "title": "Page Title", "summary": "Detailed summary of the page.", '
                    '"brief_summary": "Brief summary in a paragraph.", '
                    '"keywords": ["keyword1", "keyword2", "keyword3"] }'
    ),
    bypass_cache=True,
)

page_summary = json.loads(result.extracted_content)
print(page_summary)

# Write the raw JSON with explicit UTF-8 encoding (the charmap fix above).
os.makedirs(".data", exist_ok=True)
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
    f.write(result.extracted_content)
```
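Running the script prints the parsed summary as a Python dict and writes the raw JSON to `.data/page_summary.json`. Note the explicit `encoding="utf-8"` on the file handle, which is the same fix that resolves the Windows charmap error mentioned above.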