# crawl4ai/docs/examples/tutorial_v0.5.py

import asyncio
import time
import re
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig, MemoryAdaptiveDispatcher, HTTPCrawlerConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import (
BestFirstCrawlingStrategy,
FilterChain,
URLPatternFilter,
DomainFilter,
ContentTypeFilter,
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from pprint import pprint

# 1️⃣ Deep Crawling with Best-First Strategy
async def deep_crawl():
    """
    PART 1: Deep Crawling with Best-First Strategy

    This function demonstrates:
    - Using the BestFirstCrawlingStrategy
    - Creating filter chains to narrow down crawl targets
    - Using a scorer to prioritize certain URLs
    - Respecting robots.txt rules
    """
    print("\n===== DEEP CRAWLING =====")
    print("This example shows how to implement deep crawling with filters, scorers, and robots.txt compliance.")

    # Create a filter chain to filter urls based on patterns, domains and content type
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Create a keyword scorer that prioritises the pages with certain keywords first
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )

    # Set up the configuration with robots.txt compliance enabled
    deep_crawl_config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
        check_robots_txt=True,  # Enable robots.txt compliance
    )

    # Execute the crawl
    async with AsyncWebCrawler() as crawler:
        print("\n📊 Starting deep crawl with Best-First strategy...")
        print(" - Filtering by domain, URL patterns, and content type")
        print(" - Scoring pages based on keyword relevance")
        print(" - Respecting robots.txt rules")
        start_time = time.perf_counter()
        results = []
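        # Because stream=True is set in the config, the awaited arun() call can be
        # iterated with `async for`, yielding each page as soon as it has been crawled.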
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=deep_crawl_config):
            # Print each result as it comes in
            depth = result.metadata.get("depth", 0)
            score = result.metadata.get("score", 0)
            print(f"Crawled: {result.url} (Depth: {depth}), score: {score:.2f}")
            results.append(result)

        duration = time.perf_counter() - start_time

        # Print summary statistics
        print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")

        # Group by depth
        if results:
            depth_counts = {}
            for result in results:
                depth = result.metadata.get("depth", 0)
                depth_counts[depth] = depth_counts.get(depth, 0) + 1

            print("\n📊 Pages crawled by depth:")
            for depth, count in sorted(depth_counts.items()):
                print(f" Depth {depth}: {count} pages")

# 2️⃣ Memory-Adaptive Dispatcher
async def memory_adaptive_dispatcher():
    """
    PART 2: Memory-Adaptive Dispatcher

    This function demonstrates:
    - Using MemoryAdaptiveDispatcher to manage system memory
    - Batch and streaming modes with multiple URLs
    """
    print("\n===== MEMORY-ADAPTIVE DISPATCHER =====")
    print("This example shows how to use the memory-adaptive dispatcher for resource management.")

    # Configure the dispatcher (optional, defaults are used if not provided)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,  # Pause if memory usage exceeds 80%
        check_interval=0.5,  # Check memory every 0.5 seconds
    )

    # Test URLs
    urls = [
        "https://docs.crawl4ai.com",
        "https://github.com/unclecode/crawl4ai",
    ]

    async def batch_mode():
        print("\n📊 BATCH MODE:")
        print(" In this mode, all results are collected before being returned.")
        async with AsyncWebCrawler() as crawler:
            start_time = time.perf_counter()
            results = await crawler.arun_many(
                urls=urls,
                config=CrawlerRunConfig(stream=False),  # Batch mode
                dispatcher=dispatcher,
            )
            print(f" ✅ Received all {len(results)} results after {time.perf_counter() - start_time:.2f} seconds")
            for result in results:
                print(f"{result.url} with status code: {result.status_code}")

    async def stream_mode():
        print("\n📊 STREAMING MODE:")
        print(" In this mode, results are processed as they become available.")
        async with AsyncWebCrawler() as crawler:
            start_time = time.perf_counter()
            count = 0
            first_result_time = None
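            # With stream=True, the awaited arun_many() call is iterated as an async
            # generator, so each result is handled as soon as that URL finishes crawling.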
            async for result in await crawler.arun_many(
                urls=urls,
                config=CrawlerRunConfig(stream=True),  # Stream mode
                dispatcher=dispatcher,
            ):
                count += 1
                current_time = time.perf_counter() - start_time
                if count == 1:
                    first_result_time = current_time
                    print(f" ✅ First result after {first_result_time:.2f} seconds: {result.url}")
                else:
                    print(f" → Result #{count} after {current_time:.2f} seconds: {result.url}")

            print(f" ✅ Total: {count} results")
            print(f" ✅ First result: {first_result_time:.2f} seconds")
            print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")

    # Run both examples
    await batch_mode()
    await stream_mode()

    print("\n🔍 Key Takeaway: The memory-adaptive dispatcher prevents OOM errors")
    print(" and manages concurrency based on system resources.")

# 3️⃣ HTTP Crawler Strategy
async def http_crawler_strategy():
    """
    PART 3: HTTP Crawler Strategy

    This function demonstrates:
    - Using the lightweight HTTP-only crawler
    - Setting custom headers and configurations
    """
    print("\n===== HTTP CRAWLER STRATEGY =====")
    print("This example shows how to use the fast, lightweight HTTP-only crawler.")

    # Use the HTTP crawler strategy
    http_config = HTTPCrawlerConfig(
        method="GET",
        headers={"User-Agent": "MyCustomBot/1.0"},
        follow_redirects=True,
        verify_ssl=True,
    )

    print("\n📊 Initializing HTTP crawler strategy...")
    print(" - Using custom User-Agent: MyCustomBot/1.0")
    print(" - Following redirects: Enabled")
    print(" - Verifying SSL: Enabled")
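    # Note: the HTTP-only strategy issues plain HTTP requests without launching a
    # browser, so pages that depend on JavaScript rendering will return raw HTML only.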
    # Create crawler with HTTP strategy
    async with AsyncWebCrawler(
        crawler_strategy=AsyncHTTPCrawlerStrategy(browser_config=http_config)
    ) as crawler:
        start_time = time.perf_counter()
        result = await crawler.arun("https://example.com")
        duration = time.perf_counter() - start_time

        print(f"\n✅ Crawled in {duration:.2f} seconds")
        print(f"✅ Status code: {result.status_code}")
        print(f"✅ Content length: {len(result.html)} bytes")

        # Check if there was a redirect
        if result.redirected_url and result.redirected_url != result.url:
            print(f" Redirected from {result.url} to {result.redirected_url}")

    print("\n🔍 Key Takeaway: HTTP crawler is faster and more memory-efficient")
    print(" than browser-based crawling for simple pages.")

# 4️⃣ Proxy Rotation
async def proxy_rotation():
    """
    PART 4: Proxy Rotation

    This function demonstrates:
    - Setting up a proxy rotation strategy
    - Using multiple proxies in a round-robin fashion
    """
    print("\n===== PROXY ROTATION =====")
    print("This example shows how to implement proxy rotation for distributed crawling.")

    # Load proxies and create rotation strategy
    proxies = ProxyConfig.from_env()
    # e.g.: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
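    # Alternatively (illustrative sketch, not part of the original example): the proxy
    # list could be built directly in code. The field names below are assumed from how
    # the proxy objects are used later in this function (server, ip):
    # proxies = [
    #     ProxyConfig(server="http://1.1.1.1:8080", username="user1", password="pass1", ip="1.1.1.1"),
    #     ProxyConfig(server="http://2.2.2.2:8080", username="user2", password="pass2", ip="2.2.2.2"),
    # ]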
    if not proxies:
        print("No proxies found in environment. Set the PROXIES env variable!")
        return

    proxy_strategy = RoundRobinProxyStrategy(proxies)

    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=proxy_strategy,
    )

    urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

    print("\n📈 Initializing crawler with proxy rotation...")
    async with AsyncWebCrawler(config=browser_config) as crawler:
        print("\n🚀 Starting batch crawl with proxy rotation...")
        results = await crawler.arun_many(
            urls=urls,
            config=run_config,
        )
        for result in results:
            if result.success:
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                current_proxy = run_config.proxy_config if run_config.proxy_config else None
                if current_proxy and ip_match:
                    print(f"URL {result.url}")
                    print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
                    verified = ip_match.group(0) == current_proxy.ip
                    if verified:
                        print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
                    else:
                        print("❌ Proxy failed or IP mismatch!")
                    print("---")
            else:
                print(f"❌ Crawl via proxy failed!: {result.error_message}")

# 5️⃣ LLM Content Filter (requires API key)
async def llm_content_filter():
    """
    PART 5: LLM Content Filter

    This function demonstrates:
    - Configuring LLM providers via LLMConfig
    - Using an LLM to generate focused markdown

    Note: Requires a valid API key for the chosen LLM provider
    """
    print("\n===== LLM CONTENT FILTER =====")
    print("This example shows how to use an LLM to generate focused markdown content.")
    print("Note: This example requires an API key. Set it in environment variables.")

    # Create LLM configuration
    # Replace with your actual API key or set it as an environment variable
    llm_config = LLMConfig(
        provider="gemini/gemini-1.5-pro",
        api_token="env:GEMINI_API_KEY",  # Will read from the GEMINI_API_KEY environment variable
    )
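    # Other providers can be configured the same way, e.g. (assuming the same
    # "provider/model" naming shown above): provider="openai/gpt-4o-mini" with
    # api_token="env:OPENAI_API_KEY".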
    print("\n📊 Setting up LLM content filter...")
    print(f" - Provider: {llm_config.provider}")
    print(" - API token: Using environment variable")
    print(" - Instruction: Extract key concepts and summaries")

    # Create markdown generator with LLM filter
    markdown_generator = DefaultMarkdownGenerator(
        content_filter=LLMContentFilter(
            llm_config=llm_config,
            instruction="Extract key concepts and summaries",
        )
    )
    config = CrawlerRunConfig(markdown_generator=markdown_generator)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://docs.crawl4ai.com", config=config)
        print("\n✅ Generated focused markdown:")
        pprint(result.markdown.fit_markdown)
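        # The filtered content is read from result.markdown.fit_markdown here; the
        # unfiltered conversion is typically available as result.markdown.raw_markdown
        # (an assumption about the markdown result object, not shown in this tutorial).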

# 6️⃣ PDF Processing
async def pdf_processing():
    """
    PART 6: PDF Processing

    This function demonstrates:
    - Using PDFCrawlerStrategy and PDFContentScrapingStrategy
    - Extracting text and metadata from PDFs
    """
    print("\n===== PDF PROCESSING =====")
    print("This example shows how to extract text and metadata from PDF files.")

    # Sample PDF URL
    pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"

    print("\n📊 Initializing PDF crawler...")
    print(f" - Target PDF: {pdf_url}")
    print(" - Using PDFCrawlerStrategy and PDFContentScrapingStrategy")

    # Create crawler with PDF strategy
    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        print("\n🚀 Starting PDF processing...")
        start_time = time.perf_counter()
        result = await crawler.arun(
            pdf_url,
            config=CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy()),
        )
        duration = time.perf_counter() - start_time
        print(f"\n✅ Processed PDF in {duration:.2f} seconds")

        # Show metadata
        print("\n📄 PDF Metadata:")
        if result.metadata:
            for key, value in result.metadata.items():
                if key not in ["html", "text", "markdown"] and value:
                    print(f" - {key}: {value}")
        else:
            print(" No metadata available")

        # Show sample of content
        if result.markdown:
            print("\n📝 PDF Content Sample:")
            content_sample = result.markdown[:500] + "..." if len(result.markdown) > 500 else result.markdown
            print(f"---\n{content_sample}\n---")
        else:
            print("\n⚠️ No content extracted")

    print("\n🔍 Key Takeaway: Crawl4AI can now process PDF files")
    print(" to extract both text content and metadata.")

# 7️⃣ LLM Schema Generation (requires API key)
async def llm_schema_generation():
    """
    PART 7: LLM Schema Generation

    This function demonstrates:
    - Configuring LLM providers via LLMConfig
    - Using LLM to generate extraction schemas
    - JsonCssExtractionStrategy

    Note: Requires a valid API key for the chosen LLM provider
    """
    print("\n===== LLM SCHEMA GENERATION =====")
    print("This example shows how to use LLM to automatically generate extraction schemas.")
    print("Note: This example requires an API key. Set it in environment variables.")

    # Sample HTML
    sample_html = """
    <div class="product">
        <h2 class="title">Awesome Gaming Laptop</h2>
        <div class="price">$1,299.99</div>
        <div class="specs">
            <ul>
                <li>16GB RAM</li>
                <li>512GB SSD</li>
                <li>RTX 3080</li>
            </ul>
        </div>
        <div class="rating">4.7/5</div>
    </div>
    """

    print("\n📊 Setting up LLMConfig...")
    # Create LLM configuration
    llm_config = LLMConfig(
        provider="gemini/gemini-1.5-pro",
        api_token="env:GEMINI_API_KEY",
    )

    print("\n🚀 Generating schema for product extraction...")
    print(" This would use the LLM to analyze HTML and create an extraction schema")
    schema = JsonCssExtractionStrategy.generate_schema(
        html=sample_html,
        llm_config=llm_config,
        query="Extract product name and price",
    )

    print("\n✅ Generated Schema:")
    pprint(schema)
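    # A typical next step (illustrative sketch, not executed in this tutorial) would be
    # to feed the generated schema into a CSS-based extraction strategy and crawl with it;
    # the target URL below is a placeholder:
    #
    #     extraction_strategy = JsonCssExtractionStrategy(schema)
    #     config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
    #     async with AsyncWebCrawler() as crawler:
    #         result = await crawler.arun("https://example.com/products", config=config)
    #         print(result.extracted_content)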

# Run all sections
async def run_tutorial():
    """
    Main function to run all tutorial sections.
    """
    print("\n🚀 CRAWL4AI v0.5.0 TUTORIAL 🚀")
    print("===============================")
    print("This tutorial demonstrates the key features of Crawl4AI v0.5.0,")
    print("including deep crawling, memory-adaptive dispatching, advanced filtering,")
    print("and more powerful extraction capabilities.")

    # Sections to run
    sections = [
        deep_crawl,                  # 1. Deep Crawling with Best-First Strategy
        memory_adaptive_dispatcher,  # 2. Memory-Adaptive Dispatcher
        http_crawler_strategy,       # 3. HTTP Crawler Strategy
        proxy_rotation,              # 4. Proxy Rotation
        llm_content_filter,          # 5. LLM Content Filter
        pdf_processing,              # 6. PDF Processing
        llm_schema_generation,       # 7. Schema Generation using LLM
    ]

    for section in sections:
        try:
            await section()
        except Exception as e:
            print(f"⚠️ Error in {section.__name__}: {e}")

    print("\n🎉 TUTORIAL COMPLETE! 🎉")
    print("You've now explored the key features of Crawl4AI v0.5.0.")
    print("For more information, visit https://docs.crawl4ai.com")


# Run the tutorial
if __name__ == "__main__":
    asyncio.run(run_tutorial())