feat(proxy): add proxy rotation strategy

Implements a new proxy rotation system with the following changes:
- Add ProxyRotationStrategy abstract base class
- Add RoundRobinProxyStrategy concrete implementation
- Integrate proxy rotation with AsyncWebCrawler
- Add proxy_rotation_strategy parameter to CrawlerRunConfig
- Add example script demonstrating proxy rotation usage
- Remove deprecated synchronous WebCrawler code
- Clean up rate limiting documentation

BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations
UncleCode 2025-02-09 18:49:10 +08:00
parent b957ff2ecd
commit 19df96ed56
12 changed files with 257 additions and 162 deletions
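
Taken together, the pieces in the diffs below wire up as in this minimal usage sketch (the URL and the proxy entries are placeholders, not taken from the commit):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, RoundRobinProxyStrategy


async def main():
    # Placeholder proxy entries; each dict needs at least a "server" key
    strategy = RoundRobinProxyStrategy([
        {"server": "http://203.0.113.10:8080"},
        {"server": "http://203.0.113.11:8080"},
    ])
    config = CrawlerRunConfig(proxy_rotation_strategy=strategy)

    async with AsyncWebCrawler() as crawler:
        # Each arun() call pulls the next proxy from the strategy before fetching
        result = await crawler.arun(url="https://httpbin.org/ip", config=config)
        print(result.success)


asyncio.run(main())
```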


@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
    WebScrapingStrategy,
    LXMLWebScrapingStrategy,
)
from .proxy_strategy import (
    ProxyRotationStrategy,
    RoundRobinProxyStrategy,
)
from .extraction_strategy import (
    ExtractionStrategy,
    LLMExtractionStrategy,
@@ -60,31 +64,33 @@ __all__ = [
    "DisplayMode",
    "MarkdownGenerationResult",
    "Crawl4aiDockerClient",
    "ProxyRotationStrategy",
    "RoundRobinProxyStrategy",
]
def is_sync_version_installed():
    try:
        import selenium  # noqa
# def is_sync_version_installed():
#     try:
#         import selenium  # noqa
        return True
    except ImportError:
        return False
#         return True
#     except ImportError:
#         return False
if is_sync_version_installed():
    try:
        from .web_crawler import WebCrawler
# if is_sync_version_installed():
#     try:
#         from .web_crawler import WebCrawler
        __all__.append("WebCrawler")
    except ImportError:
        print(
            "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
        )
else:
    WebCrawler = None
    # import warnings
    # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
#         __all__.append("WebCrawler")
#     except ImportError:
#         print(
#             "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
#         )
# else:
#     WebCrawler = None
#     # import warnings
#     # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
# Disable all Pydantic warnings
warnings.filterwarnings("ignore", module="pydantic")


@@ -16,6 +16,7 @@ from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrat
from .deep_crawling import DeepCrawlStrategy
from typing import Union, List
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
import inspect
from typing import Any, Dict, Optional
@@ -542,6 +543,7 @@ class CrawlerRunConfig():
        parser_type: str = "lxml",
        scraping_strategy: ContentScrapingStrategy = None,
        proxy_config: dict = None,
        proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
        # SSL Parameters
        fetch_ssl_certificate: bool = False,
        # Caching Parameters
@@ -620,6 +622,7 @@ class CrawlerRunConfig():
        self.parser_type = parser_type
        self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
        self.proxy_config = proxy_config
        self.proxy_rotation_strategy = proxy_rotation_strategy
        # SSL Parameters
        self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -731,6 +734,7 @@ class CrawlerRunConfig():
            parser_type=kwargs.get("parser_type", "lxml"),
            scraping_strategy=kwargs.get("scraping_strategy"),
            proxy_config=kwargs.get("proxy_config"),
            proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
            # SSL Parameters
            fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
            # Caching Parameters
@@ -827,6 +831,7 @@ class CrawlerRunConfig():
            "parser_type": self.parser_type,
            "scraping_strategy": self.scraping_strategy,
            "proxy_config": self.proxy_config,
            "proxy_rotation_strategy": self.proxy_rotation_strategy,
            "fetch_ssl_certificate": self.fetch_ssl_certificate,
            "cache_mode": self.cache_mode,
            "session_id": self.session_id,


@@ -394,6 +394,19 @@ class AsyncWebCrawler:
                        tag="FETCH",
                    )

            # Update proxy configuration from rotation strategy if available
            if config and config.proxy_rotation_strategy:
                next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
                if next_proxy:
                    if verbose:
                        self.logger.info(
                            message="Switch proxy: {proxy}",
                            tag="PROXY",
                            params={"proxy": next_proxy.get("server")},
                        )
                    config.proxy_config = next_proxy
                    # config = config.clone(proxy_config=next_proxy)

            # Fetch fresh content if needed
            if not cached_result or not html:
                t1 = time.perf_counter()
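
The commented-out line above hints at a non-mutating alternative. A hedged sketch of that variant, wrapped in a hypothetical helper (`apply_rotated_proxy` is not part of this commit), assuming `clone()` accepts keyword overrides as the CrawlerRunConfig docs later in this diff describe:

```python
from crawl4ai import CrawlerRunConfig


async def apply_rotated_proxy(config: CrawlerRunConfig) -> CrawlerRunConfig:
    """Return a per-request copy carrying the next proxy, leaving the shared
    config untouched (hypothetical helper; assumes clone() keyword overrides)."""
    if config and config.proxy_rotation_strategy:
        next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
        if next_proxy:
            return config.clone(proxy_config=next_proxy)
    return config
```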


@@ -0,0 +1,43 @@
from typing import List, Dict, Optional
from abc import ABC, abstractmethod
from itertools import cycle


class ProxyRotationStrategy(ABC):
    """Base abstract class for proxy rotation strategies"""

    @abstractmethod
    async def get_next_proxy(self) -> Optional[Dict]:
        """Get next proxy configuration from the strategy"""
        pass

    @abstractmethod
    def add_proxies(self, proxies: List[Dict]):
        """Add proxy configurations to the strategy"""
        pass


class RoundRobinProxyStrategy(ProxyRotationStrategy):
    """Simple round-robin proxy rotation strategy"""

    def __init__(self, proxies: List[Dict] = None):
        """
        Initialize with optional list of proxy configurations

        Args:
            proxies: List of proxy config dictionaries, each containing at least
                     'server' key with proxy URL
        """
        self._proxies = []
        self._proxy_cycle = None
        if proxies:
            self.add_proxies(proxies)

    def add_proxies(self, proxies: List[Dict]):
        """Add new proxies to the rotation pool"""
        self._proxies.extend(proxies)
        self._proxy_cycle = cycle(self._proxies)

    async def get_next_proxy(self) -> Optional[Dict]:
        """Get next proxy in round-robin fashion"""
        if not self._proxy_cycle:
            return None
        return next(self._proxy_cycle)
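
Beyond round-robin, the abstract base class above makes custom rotation policies easy to slot in. A hedged sketch of one such policy (the `RandomProxyStrategy` name and its random-choice behaviour are illustrative, not part of this commit):

```python
import random
from typing import Dict, List, Optional

from crawl4ai import ProxyRotationStrategy


class RandomProxyStrategy(ProxyRotationStrategy):
    """Illustrative custom strategy: pick a proxy at random instead of cycling."""

    def __init__(self, proxies: List[Dict] = None):
        self._proxies: List[Dict] = []
        if proxies:
            self.add_proxies(proxies)

    def add_proxies(self, proxies: List[Dict]):
        # Extend the pool; no cycle object is needed for random selection
        self._proxies.extend(proxies)

    async def get_next_proxy(self) -> Optional[Dict]:
        # Mirror RoundRobinProxyStrategy: return None when the pool is empty
        return random.choice(self._proxies) if self._proxies else None
```

Any such subclass can be passed to `CrawlerRunConfig(proxy_rotation_strategy=...)` exactly like the round-robin implementation.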


@@ -0,0 +1,161 @@
import os
import re
from typing import List, Dict

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    RoundRobinProxyStrategy
)


def load_proxies_from_env() -> List[Dict]:
    """Load proxies from PROXIES environment variable"""
    proxies = []
    try:
        proxy_list = os.getenv("PROXIES", "").split(",")
        for proxy in proxy_list:
            if not proxy:
                continue
            ip, port, username, password = proxy.split(":")
            proxies.append({
                "server": f"http://{ip}:{port}",
                "username": username,
                "password": password,
                "ip": ip  # Store original IP for verification
            })
    except Exception as e:
        print(f"Error loading proxies from environment: {e}")
    return proxies
async def demo_proxy_rotation():
    """
    Proxy Rotation Demo using RoundRobinProxyStrategy
    =================================================
    Demonstrates proxy rotation using the strategy pattern.
    """
    print("\n=== Proxy Rotation Demo (Round Robin) ===")

    # Load proxies and create rotation strategy
    proxies = load_proxies_from_env()
    if not proxies:
        print("No proxies found in environment. Set PROXIES env variable!")
        return

    proxy_strategy = RoundRobinProxyStrategy(proxies)

    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=proxy_strategy
    )

    # Test URLs
    urls = ["https://httpbin.org/ip"] * len(proxies)  # Test each proxy once

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            result = await crawler.arun(url=url, config=run_config)
            if result.success:
                # Extract IP from response
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                current_proxy = run_config.proxy_config if run_config.proxy_config else None
                if current_proxy:
                    print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
                    verified = ip_match and ip_match.group(0) == current_proxy['ip']
                    if verified:
                        print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
                    else:
                        print("❌ Proxy failed or IP mismatch!")
            else:
                print(f"Request failed: {result.error_message}")
async def demo_proxy_rotation_batch():
    """
    Proxy Rotation Demo with Batch Processing
    =========================================
    Demonstrates proxy rotation using arun_many with memory dispatcher.
    """
    print("\n=== Proxy Rotation Batch Demo ===")

    try:
        # Load proxies and create rotation strategy
        proxies = load_proxies_from_env()
        if not proxies:
            print("No proxies found in environment. Set PROXIES env variable!")
            return

        proxy_strategy = RoundRobinProxyStrategy(proxies)

        # Configurations
        browser_config = BrowserConfig(headless=True, verbose=False)
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            proxy_rotation_strategy=proxy_strategy,
            markdown_generator=DefaultMarkdownGenerator()
        )

        # Test URLs - multiple requests to test rotation
        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

        print("\n📈 Initializing crawler with proxy rotation...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            monitor = CrawlerMonitor(
                max_visible_rows=10,
                display_mode=DisplayMode.DETAILED
            )
            dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=80.0,
                check_interval=0.5,
                max_session_permit=1,  # len(proxies),  # Match concurrent sessions to proxy count
                # monitor=monitor
            )

            print("\n🚀 Starting batch crawl with proxy rotation...")
            results = await crawler.arun_many(
                urls=urls,
                config=run_config,
                dispatcher=dispatcher
            )

            # Verify results
            success_count = 0
            for result in results:
                if result.success:
                    ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                    current_proxy = run_config.proxy_config if run_config.proxy_config else None
                    if current_proxy and ip_match:
                        print(f"URL {result.url}")
                        print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0)}")
                        verified = ip_match.group(0) == current_proxy['ip']
                        if verified:
                            print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
                            success_count += 1
                        else:
                            print("❌ Proxy failed or IP mismatch!")
                        print("---")

            print(f"\n✅ Completed {len(results)} requests with {success_count} successful proxy verifications")

    except Exception as e:
        print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")


if __name__ == "__main__":
    import asyncio
    from crawl4ai import (
        CrawlerMonitor,
        DisplayMode,
        MemoryAdaptiveDispatcher,
        DefaultMarkdownGenerator
    )

    async def run_demos():
        # await demo_proxy_rotation()  # Original single-request demo
        await demo_proxy_rotation_batch()  # New batch processing demo

    asyncio.run(run_demos())
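
For reference, `load_proxies_from_env()` above expects `PROXIES` to hold comma-separated `ip:port:username:password` entries. A hedged sketch with placeholder values (documentation-range addresses, made-up credentials):

```python
import os

# Placeholder values only; substitute real proxy endpoints and credentials.
os.environ["PROXIES"] = (
    "203.0.113.10:8080:user1:pass1,"
    "203.0.113.11:8080:user2:pass2"
)
```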


@@ -160,41 +160,9 @@ The `arun_many()` method now uses an intelligent dispatcher that:
### 4.2 Example Usage

Check page [Multi-url Crawling](../advanced/multi-url-crawling.md) for a detailed example of how to use `arun_many()`.

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, RateLimitConfig
from crawl4ai.dispatcher import DisplayMode

# Configure browser
browser_cfg = BrowserConfig(headless=True)

# Configure crawler with rate limiting
run_cfg = CrawlerRunConfig(
    # Enable rate limiting
    enable_rate_limiting=True,
    rate_limit_config=RateLimitConfig(
        base_delay=(1.0, 2.0),       # Random delay between 1-2 seconds
        max_delay=30.0,              # Maximum delay after rate limit hits
        max_retries=2,               # Number of retries before giving up
        rate_limit_codes=[429, 503]  # Status codes that trigger rate limiting
    ),
    # Resource monitoring
    memory_threshold_percent=70.0,   # Pause if memory exceeds this
    check_interval=0.5,              # How often to check resources
    max_session_permit=3,            # Maximum concurrent crawls
    display_mode=DisplayMode.DETAILED.value  # Show detailed progress
)

urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]

async with AsyncWebCrawler(config=browser_cfg) as crawler:
    results = await crawler.arun_many(urls, config=run_cfg)
    for result in results:
        print(f"URL: {result.url}, Success: {result.success}")
```
### 4.3 Key Features


@@ -159,32 +159,7 @@ Use these for link-level content filtering (often to keep crawls “internal”
---
### G) **Rate Limiting & Resource Management**
| **Parameter** | **Type / Default** | **What It Does** |
|------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
| **`enable_rate_limiting`** | `bool` (default: `False`) | Enable intelligent rate limiting for multiple URLs |
| **`rate_limit_config`** | `RateLimitConfig` (default: `None`) | Configuration for rate limiting behavior |
The `RateLimitConfig` class has these fields:
| **Field** | **Type / Default** | **What It Does** |
|--------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
| **`base_delay`** | `Tuple[float, float]` (1.0, 3.0) | Random delay range between requests to the same domain |
| **`max_delay`** | `float` (60.0) | Maximum delay after rate limit detection |
| **`max_retries`** | `int` (3) | Number of retries before giving up on rate-limited requests |
| **`rate_limit_codes`** | `List[int]` ([429, 503]) | HTTP status codes that trigger rate limiting behavior |
| **Parameter** | **Type / Default** | **What It Does** |
|-------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
| **`memory_threshold_percent`** | `float` (70.0) | Maximum memory usage before pausing new crawls |
| **`check_interval`** | `float` (1.0) | How often to check system resources (in seconds) |
| **`max_session_permit`** | `int` (20) | Maximum number of concurrent crawl sessions |
| **`display_mode`** | `str` (`None`, "DETAILED", "AGGREGATED") | How to display progress information |
---
### H) **Debug & Logging**
### G) **Debug & Logging**
| **Parameter** | **Type / Default** | **What It Does** |
|----------------|--------------------|---------------------------------------------------------------------------|
@@ -218,7 +193,7 @@ The `clone()` method is particularly useful when you need slightly different con
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, RateLimitConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

async def main():
    # Configure the browser
@@ -239,17 +214,6 @@ async def main():
        exclude_external_links=True,
        wait_for="css:.article-loaded",
        screenshot=True,
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 3.0),
            max_delay=60.0,
            max_retries=3,
            rate_limit_codes=[429, 503]
        ),
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=20,
        display_mode="DETAILED",
        stream=True
    )


@@ -186,23 +186,19 @@ class CrawlerRunConfig:
    - If `True`, enables rate limiting for batch processing.
    - Requires `rate_limit_config` to be set.
10. **`rate_limit_config`**:
    - A `RateLimitConfig` object controlling rate limiting behavior.
    - See below for details.
11. **`memory_threshold_percent`**:
10. **`memory_threshold_percent`**:
    - The memory threshold (as a percentage) to monitor.
    - If exceeded, the crawler will pause or slow down.
12. **`check_interval`**:
11. **`check_interval`**:
    - The interval (in seconds) to check system resources.
    - Affects how often memory and CPU usage are monitored.
13. **`max_session_permit`**:
12. **`max_session_permit`**:
    - The maximum number of concurrent crawl sessions.
    - Helps prevent overwhelming the system.
14. **`display_mode`**:
13. **`display_mode`**:
    - The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
    - Affects how much information is printed during the crawl.
@@ -236,58 +232,6 @@ The `clone()` method:
- Leaves the original configuration unchanged
- Perfect for creating variations without repeating all parameters

### Rate Limiting & Resource Management

For batch processing with `arun_many()`, you can enable intelligent rate limiting:

```python
from crawl4ai import RateLimitConfig

config = CrawlerRunConfig(
    enable_rate_limiting=True,
    rate_limit_config=RateLimitConfig(
        base_delay=(1.0, 3.0),       # Random delay range
        max_delay=60.0,              # Max delay after rate limits
        max_retries=3,               # Retries before giving up
        rate_limit_codes=[429, 503]  # Status codes to watch
    ),
    memory_threshold_percent=70.0,   # Memory threshold
    check_interval=1.0,              # Resource check interval
    max_session_permit=20,           # Max concurrent crawls
    display_mode="DETAILED"          # Progress display mode
)
```

This configuration:
- Implements intelligent rate limiting per domain
- Monitors system resources
- Provides detailed progress information
- Manages concurrent crawls efficiently

**Minimal Example**:

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

crawl_conf = CrawlerRunConfig(
    js_code="document.querySelector('button#loadMore')?.click()",
    wait_for="css:.loaded-content",
    screenshot=True,
    enable_rate_limiting=True,
    rate_limit_config=RateLimitConfig(
        base_delay=(1.0, 3.0),
        max_delay=60.0,
        max_retries=3,
        rate_limit_codes=[429, 503]
    ),
    stream=True  # Enable streaming
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com", config=crawl_conf)
    print(result.screenshot[:100])  # Base64-encoded PNG snippet
```
---
## 3. Putting It All Together
@@ -322,13 +266,6 @@ async def main():
    run_conf = CrawlerRunConfig(
        extraction_strategy=extraction,
        cache_mode=CacheMode.BYPASS,
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 3.0),
            max_delay=60.0,
            max_retries=3,
            rate_limit_codes=[429, 503]
        )
    )

    async with AsyncWebCrawler(config=browser_conf) as crawler:


@@ -31,9 +31,6 @@ import re
import random
from typing import Optional, Dict
from dotenv import load_dotenv

load_dotenv()

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
@@ -48,6 +45,7 @@ from crawl4ai import (
    LLMContentFilter
)

load_dotenv()


async def demo_memory_dispatcher():
    """Demonstrates the new memory-efficient dispatcher system.
@@ -283,7 +281,7 @@ async def demo_proxy_rotation():
    """
    print("\n=== 8. Proxy Rotation Demo ===")

    async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]:
    async def get_next_proxy(proxy_file: str = "proxies.txt") -> Optional[Dict]:
        """Get next proxy from local file"""
        try:
            proxies = os.getenv("PROXIES", "").split(",")
@@ -323,7 +321,7 @@ async def demo_proxy_rotation():
                    if verified:
                        print(f"✅ Proxy working! IP matches: {proxy['ip']}")
                    else:
                        print(f"❌ Proxy failed or IP mismatch!")
                        print("❌ Proxy failed or IP mismatch!")
                else:
                    print(f"Failed with proxy {proxy['ip']}")