
Moved ProxyConfig class from proxy_strategy.py to async_configs.py for better organization. Improved LLM token handling with new PROVIDER_MODELS_PREFIXES. Added test cases for deep crawling and proxy rotation. Removed docker_config from BrowserConfig as it's handled separately. BREAKING CHANGE: ProxyConfig import path changed from crawl4ai.proxy_strategy to crawl4ai.
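
For code affected by the breaking change, the fix is a one-line import update (sketch; per the note above only the import location moved, not the class itself):

    # old import path
    from crawl4ai.proxy_strategy import ProxyConfig
    # new import path
    from crawl4ai import ProxyConfig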
413 lines · 15 KiB · Python
import asyncio
import os
import json
import base64
from pathlib import Path
from typing import List
from crawl4ai import ProxyConfig

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai import LLMConfig
from crawl4ai import PruningContentFilter, BM25ContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain
from crawl4ai import BrowserConfig

__cur_dir__ = Path(__file__).parent
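
# Added safeguard: several demos below write into this tmp folder, so create it
# up front to avoid FileNotFoundError on the open() calls.
os.makedirs(__cur_dir__ / "tmp", exist_ok=True)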


async def demo_basic_crawl():
    """Basic web crawling with markdown generation"""
    print("\n=== 1. Basic Web Crawling ===")
    async with AsyncWebCrawler(config=BrowserConfig(
        viewport_height=800,
        viewport_width=1200,
        headless=True,
        verbose=True,
    )) as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com/"
        )

        for i, result in enumerate(results):
            print(f"Result {i + 1}:")
            print(f"Success: {result.success}")
            if result.success:
                print(f"Markdown length: {len(result.markdown.raw_markdown)} chars")
                print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...")
            else:
                print("Failed to crawl the URL")


async def demo_parallel_crawl():
    """Crawl multiple URLs in parallel"""
    print("\n=== 2. Parallel Crawling ===")

    urls = [
        "https://news.ycombinator.com/",
        "https://example.com/",
        "https://httpbin.org/html",
    ]

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun_many(
            urls=urls,
        )

        print(f"Crawled {len(results)} URLs in parallel:")
        for i, result in enumerate(results):
            print(
                f"  {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}"
            )


async def demo_fit_markdown():
    """Generate focused "fit" markdown with a pruning content filter"""
    print("\n=== 3. Fit Markdown with Content Filter ===")

    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Python_(programming_language)",
            config=CrawlerRunConfig(
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter()
                )
            ),
        )

        # Print stats for the raw vs. fit markdown
        print(f"Raw: {len(result.markdown.raw_markdown)} chars")
        print(f"Fit: {len(result.markdown.fit_markdown)} chars")


async def demo_llm_structured_extraction_no_schema():
    """Extract structured data with an LLM, without a handcrafted CSS schema"""
    print("\n=== 4. LLM Structured Extraction ===")

    # Create a simple LLM extraction strategy (no predefined CSS schema required)
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="groq/qwen-2.5-32b",
            api_token="env:GROQ_API_KEY",
        ),
        instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
        extract_type="schema",
        schema="{title: string, url: string, comments: int}",
        extra_args={
            "temperature": 0.0,
            "max_tokens": 4096,
        },
        verbose=True,
    )

    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://news.ycombinator.com/", config=config
        )

        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")


async def demo_css_structured_extraction_no_schema():
    """Extract structured data using CSS selectors"""
    print("\n=== 5. CSS-Based Structured Extraction ===")

    # Sample HTML for schema generation (one-time cost)
    sample_html = """
    <div class="body-post clear">
        <a class="story-link" href="https://thehackernews.com/2025/04/malicious-python-packages-on-pypi.html">
            <div class="clear home-post-box cf">
                <div class="home-img clear">
                    <div class="img-ratio">
                        <img alt="..." src="...">
                    </div>
                </div>
                <div class="clear home-right">
                    <h2 class="home-title">Malicious Python Packages on PyPI Downloaded 39,000+ Times, Steal Sensitive Data</h2>
                    <div class="item-label">
                        <span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span>
                        <span class="h-tags">Malware / Supply Chain Attack</span>
                    </div>
                    <div class="home-desc"> Cybersecurity researchers have...</div>
                </div>
            </div>
        </a>
    </div>
    """

    # Check whether a previously generated schema file exists
    schema_file_path = f"{__cur_dir__}/tmp/schema.json"
    if os.path.exists(schema_file_path):
        with open(schema_file_path, "r") as f:
            schema = json.load(f)
    else:
        # Generate the schema with an LLM (one-time setup)
        schema = JsonCssExtractionStrategy.generate_schema(
            html=sample_html,
            llm_config=LLMConfig(
                provider="groq/qwen-2.5-32b",
                api_token="env:GROQ_API_KEY",
            ),
            query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
        )

        print(f"Generated schema: {json.dumps(schema, indent=2)}")
        # Save the schema so future runs can reuse it; the LLM is only called once
        # to generate the schema, never during extraction.
        with open(schema_file_path, "w") as f:
            json.dump(schema, f, indent=2)

    # Create a no-LLM extraction strategy with the generated schema
    extraction_strategy = JsonCssExtractionStrategy(schema)
    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)

    # Use the fast CSS extraction (no LLM calls during extraction)
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://thehackernews.com", config=config
        )

        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")


async def demo_deep_crawl():
    """Deep crawling with BFS strategy"""
    print("\n=== 6. Deep Crawling ===")

    filter_chain = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])])

    deep_crawl_strategy = BFSDeepCrawlStrategy(
        max_depth=1, max_pages=5, filter_chain=filter_chain
    )

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy),
        )

        print(f"Deep crawl returned {len(results)} pages:")
        for i, result in enumerate(results):
            depth = result.metadata.get("depth", "unknown")
            print(f"  {i + 1}. {result.url} (Depth: {depth})")


async def demo_js_interaction():
    """Execute JavaScript to load more content"""
    print("\n=== 7. JavaScript Interaction ===")

    # A simple page that needs JS to reveal content
    async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
        # Initial load
        news_schema = {
            "name": "news",
            "baseSelector": "tr.athing",
            "fields": [
                {
                    "name": "title",
                    "selector": "span.titleline",
                    "type": "text",
                }
            ],
        }
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com",
            config=CrawlerRunConfig(
                session_id="hn_session",  # Keep session
                extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
            ),
        )

        news = []
        for result in results:
            if result.success:
                data = json.loads(result.extracted_content)
                news.extend(data)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")

        print(f"Initial items: {len(news)}")

        # Click "More" link
        more_config = CrawlerRunConfig(
            js_code="document.querySelector('a.morelink').click();",
            js_only=True,  # Continue in same page
            session_id="hn_session",  # Keep session
            extraction_strategy=JsonCssExtractionStrategy(
                schema=news_schema,
            ),
        )

        results = await crawler.arun(
            url="https://news.ycombinator.com", config=more_config
        )

        # Extract the newly loaded items
        for result in results:
            if result.success:
                data = json.loads(result.extracted_content)
                news.extend(data)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")

        print(f"Total items: {len(news)}")


async def demo_media_and_links():
    """Extract media and links from a page"""
    print("\n=== 8. Media and Links Extraction ===")

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://en.wikipedia.org/wiki/Main_Page"
        )

        for i, result in enumerate(results):
            # Extract and save all images
            images = result.media.get("images", [])
            print(f"Found {len(images)} images")

            # Extract and save all links (internal and external)
            internal_links = result.links.get("internal", [])
            external_links = result.links.get("external", [])
            print(f"Found {len(internal_links)} internal links")
            print(f"Found {len(external_links)} external links")

            # Print some of the images and links
            for image in images[:3]:
                print(f"Image: {image['src']}")
            for link in internal_links[:3]:
                print(f"Internal link: {link['href']}")
            for link in external_links[:3]:
                print(f"External link: {link['href']}")

            # Save everything to files
            with open(f"{__cur_dir__}/tmp/images.json", "w") as f:
                json.dump(images, f, indent=2)

            with open(f"{__cur_dir__}/tmp/links.json", "w") as f:
                json.dump(
                    {"internal": internal_links, "external": external_links},
                    f,
                    indent=2,
                )


async def demo_screenshot_and_pdf():
    """Capture screenshot and PDF of a page"""
    print("\n=== 9. Screenshot and PDF Capture ===")

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            # url="https://example.com",
            url="https://en.wikipedia.org/wiki/Giant_anteater",
            config=CrawlerRunConfig(screenshot=True, pdf=True),
        )

        for i, result in enumerate(results):
            if result.screenshot:
                # Save screenshot
                screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png"
                with open(screenshot_path, "wb") as f:
                    f.write(base64.b64decode(result.screenshot))
                print(f"Screenshot saved to {screenshot_path}")

            if result.pdf:
                # Save PDF
                pdf_path = f"{__cur_dir__}/tmp/example.pdf"
                with open(pdf_path, "wb") as f:
                    f.write(result.pdf)
                print(f"PDF saved to {pdf_path}")


async def demo_proxy_rotation():
    """Proxy rotation for multiple requests"""
    print("\n=== 10. Proxy Rotation ===")

    # Example proxies (replace with real ones)
    proxies = [
        ProxyConfig(server="http://proxy1.example.com:8080"),
        ProxyConfig(server="http://proxy2.example.com:8080"),
    ]

    proxy_strategy = RoundRobinProxyStrategy(proxies)

    print(f"Using {len(proxies)} proxies in rotation")
    print(
        "Note: This example uses placeholder proxies - replace with real ones to test"
    )

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            proxy_rotation_strategy=proxy_strategy
        )

        # In a real scenario, these would be run and the proxies would rotate
        print("In a real scenario, requests would rotate through the available proxies")


async def demo_raw_html_and_file():
    """Process raw HTML and local files"""
    print("\n=== 11. Raw HTML and Local Files ===")

    raw_html = """
    <html><body>
        <h1>Sample Article</h1>
        <p>This is sample content for testing Crawl4AI's raw HTML processing.</p>
    </body></html>
    """

    # Save to file
    file_path = Path("docs/examples/tmp/sample.html").absolute()
    with open(file_path, "w") as f:
        f.write(raw_html)

    async with AsyncWebCrawler() as crawler:
        # Crawl raw HTML
        raw_result = await crawler.arun(
            url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        )
        print("Raw HTML processing:")
        print(f"  Markdown: {raw_result.markdown.raw_markdown[:50]}...")

        # Crawl local file
        file_result = await crawler.arun(
            url=f"file://{file_path}",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("\nLocal file processing:")
        print(f"  Markdown: {file_result.markdown.raw_markdown[:50]}...")

    # Clean up
    os.remove(file_path)
    print(f"Processed both raw HTML and local file ({file_path})")


async def main():
    """Run all demo functions sequentially"""
    print("=== Comprehensive Crawl4AI Demo ===")
    print("Note: Some examples require API keys or other configurations")

    # Run all demos
    await demo_basic_crawl()
    await demo_parallel_crawl()
    await demo_fit_markdown()
    await demo_llm_structured_extraction_no_schema()
    await demo_css_structured_extraction_no_schema()
    await demo_deep_crawl()
    await demo_js_interaction()
    await demo_media_and_links()
    await demo_screenshot_and_pdf()
    # await demo_proxy_rotation()  # Disabled: uses placeholder proxies
    await demo_raw_html_and_file()

    # Clean up any temp files that may have been created
    print("\n=== Demo Complete ===")
    print("Check for any generated files (screenshots, PDFs) in the tmp/ folder next to this script")


if __name__ == "__main__":
    asyncio.run(main())