crawl4ai/docs/examples/proxy_rotation_demo.py
UncleCode 19df96ed56 feat(proxy): add proxy rotation strategy
Implements a new proxy rotation system with the following changes:
- Add ProxyRotationStrategy abstract base class
- Add RoundRobinProxyStrategy concrete implementation (see the sketch below)
- Integrate proxy rotation with AsyncWebCrawler
- Add proxy_rotation_strategy parameter to CrawlerRunConfig
- Add example script demonstrating proxy rotation usage
- Remove deprecated synchronous WebCrawler code
- Clean up rate limiting documentation

BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations
2025-02-09 18:49:10 +08:00
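
For reference, a minimal sketch of what the two strategy classes named in the commit might look like. The class names come from this commit; the single async get_next_proxy() method and the itertools.cycle internals are assumptions for illustration, not the library's confirmed API:

from abc import ABC, abstractmethod
from itertools import cycle
from typing import Dict, List, Optional

class ProxyRotationStrategy(ABC):
    """Strategy interface: yield the proxy config to use for the next request."""

    @abstractmethod
    async def get_next_proxy(self) -> Optional[Dict]:
        ...

class RoundRobinProxyStrategy(ProxyRotationStrategy):
    """Cycle through a fixed proxy list in order, wrapping around at the end."""

    def __init__(self, proxies: List[Dict]):
        self._proxies = cycle(proxies) if proxies else None

    async def get_next_proxy(self) -> Optional[Dict]:
        return next(self._proxies) if self._proxies else None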


import asyncio
import os
import re
from typing import List, Dict

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    CrawlerMonitor,
    DisplayMode,
    MemoryAdaptiveDispatcher,
    DefaultMarkdownGenerator,
    RoundRobinProxyStrategy,
)


def load_proxies_from_env() -> List[Dict]:
    """Load proxies from the PROXIES environment variable."""
    proxies = []
    try:
        # Expected format per entry: ip:port:username:password
        proxy_list = os.getenv("PROXIES", "").split(",")
        for proxy in proxy_list:
            if not proxy:
                continue
            ip, port, username, password = proxy.split(":")
            proxies.append({
                "server": f"http://{ip}:{port}",
                "username": username,
                "password": password,
                "ip": ip,  # store the original IP for verification
            })
    except Exception as e:
        print(f"Error loading proxies from environment: {e}")
    return proxies
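
# A PROXIES value matching the parser above might look like this
# (placeholder addresses, shown here purely as an assumed example):
#
#   export PROXIES="203.0.113.10:8080:alice:secret1,203.0.113.11:8080:bob:secret2"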


async def demo_proxy_rotation():
    """
    Proxy Rotation Demo using RoundRobinProxyStrategy
    =================================================
    Demonstrates proxy rotation using the strategy pattern.
    """
    print("\n=== Proxy Rotation Demo (Round Robin) ===")

    # Load proxies and create the rotation strategy
    proxies = load_proxies_from_env()
    if not proxies:
        print("No proxies found in environment. Set the PROXIES env variable!")
        return

    proxy_strategy = RoundRobinProxyStrategy(proxies)

    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=proxy_strategy,
    )

    # Test URLs: one request per proxy, so each proxy is exercised once
    urls = ["https://httpbin.org/ip"] * len(proxies)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            result = await crawler.arun(url=url, config=run_config)
            if result.success:
                # Extract the reported IP from the httpbin response
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                current_proxy = run_config.proxy_config if run_config.proxy_config else None
                if current_proxy:
                    print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
                    verified = ip_match and ip_match.group(0) == current_proxy['ip']
                    if verified:
                        print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
                    else:
                        print("❌ Proxy failed or IP mismatch!")
            else:
                print(f"Request failed: {result.error_message}")


async def demo_proxy_rotation_batch():
    """
    Proxy Rotation Demo with Batch Processing
    =========================================
    Demonstrates proxy rotation using arun_many with the memory dispatcher.
    """
    print("\n=== Proxy Rotation Batch Demo ===")
    try:
        # Load proxies and create the rotation strategy
        proxies = load_proxies_from_env()
        if not proxies:
            print("No proxies found in environment. Set the PROXIES env variable!")
            return

        proxy_strategy = RoundRobinProxyStrategy(proxies)

        # Configurations
        browser_config = BrowserConfig(headless=True, verbose=False)
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            proxy_rotation_strategy=proxy_strategy,
            markdown_generator=DefaultMarkdownGenerator(),
        )

        # Test URLs: multiple requests to exercise rotation (each proxy twice)
        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)

        print("\n📈 Initializing crawler with proxy rotation...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            monitor = CrawlerMonitor(  # optional live progress display; unused unless passed to the dispatcher below
                max_visible_rows=10,
                display_mode=DisplayMode.DETAILED,
            )
            dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=80.0,
                check_interval=0.5,
                max_session_permit=1,  # set to len(proxies) to match concurrent sessions to the proxy count
                # monitor=monitor,
            )

            print("\n🚀 Starting batch crawl with proxy rotation...")
            results = await crawler.arun_many(
                urls=urls,
                config=run_config,
                dispatcher=dispatcher,
            )

            # Verify results
            success_count = 0
            for result in results:
                if result.success:
                    ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                    current_proxy = run_config.proxy_config if run_config.proxy_config else None
                    if current_proxy and ip_match:
                        print(f"URL {result.url}")
                        print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0)}")
                        verified = ip_match.group(0) == current_proxy['ip']
                        if verified:
                            print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
                            success_count += 1
                        else:
                            print("❌ Proxy failed or IP mismatch!")
                        print("---")

            print(f"\n✅ Completed {len(results)} requests with {success_count} successful proxy verifications")

    except Exception as e:
        print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")


if __name__ == "__main__":
    async def run_demos():
        # await demo_proxy_rotation()  # original single-request demo
        await demo_proxy_rotation_batch()  # batch processing demo

    asyncio.run(run_demos())
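
# To run the demo end to end (assuming reachable, authenticated proxies):
#
#   export PROXIES="<ip>:<port>:<user>:<pass>,<ip2>:<port2>:<user2>:<pass2>"
#   python proxy_rotation_demo.py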