Mirror of https://github.com/unclecode/crawl4ai.git (synced 2025-12-28 10:58:29 +00:00)

feat(proxy): add proxy rotation strategy

Implements a new proxy rotation system with the following changes:
- Add ProxyRotationStrategy abstract base class
- Add RoundRobinProxyStrategy concrete implementation
- Integrate proxy rotation with AsyncWebCrawler
- Add proxy_rotation_strategy parameter to CrawlerRunConfig
- Add example script demonstrating proxy rotation usage
- Remove deprecated synchronous WebCrawler code
- Clean up rate limiting documentation

BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations

Parent: b957ff2ecd
Commit: 19df96ed56
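
For orientation, the pieces added below fit together roughly like this (a minimal sketch assembled from the API introduced in this commit; the proxy servers are placeholders):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, RoundRobinProxyStrategy

async def main():
    # Round-robin over a list of proxy configs; each dict needs at least a "server" key
    strategy = RoundRobinProxyStrategy(proxies=[
        {"server": "http://203.0.113.10:8080"},  # placeholder proxy
        {"server": "http://203.0.113.11:8080"},  # placeholder proxy
    ])
    # The strategy is handed to the run config; arun() pulls the next proxy per request
    config = CrawlerRunConfig(proxy_rotation_strategy=strategy)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://httpbin.org/ip", config=config)
        print(result.success)

asyncio.run(main())
```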
@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
     WebScrapingStrategy,
     LXMLWebScrapingStrategy,
 )
+from .proxy_strategy import (
+    ProxyRotationStrategy,
+    RoundRobinProxyStrategy,
+)
 from .extraction_strategy import (
     ExtractionStrategy,
     LLMExtractionStrategy,
@@ -60,31 +64,33 @@ __all__ = [
     "DisplayMode",
     "MarkdownGenerationResult",
     "Crawl4aiDockerClient",
+    "ProxyRotationStrategy",
+    "RoundRobinProxyStrategy",
 ]
 
 
-def is_sync_version_installed():
-    try:
-        import selenium # noqa
+# def is_sync_version_installed():
+#     try:
+#         import selenium # noqa
 
-        return True
-    except ImportError:
-        return False
+#         return True
+#     except ImportError:
+#         return False
 
 
-if is_sync_version_installed():
-    try:
-        from .web_crawler import WebCrawler
+# if is_sync_version_installed():
+#     try:
+#         from .web_crawler import WebCrawler
 
-        __all__.append("WebCrawler")
-    except ImportError:
-        print(
-            "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
-        )
-else:
-    WebCrawler = None
-    # import warnings
-    # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
+#         __all__.append("WebCrawler")
+#     except ImportError:
+#         print(
+#             "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
+#         )
+# else:
+#     WebCrawler = None
+#     # import warnings
+#     # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
 
 # Disable all Pydantic warnings
 warnings.filterwarnings("ignore", module="pydantic")
@@ -16,6 +16,7 @@ from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrat
 from .deep_crawling import DeepCrawlStrategy
 from typing import Union, List
 from .cache_context import CacheMode
+from .proxy_strategy import ProxyRotationStrategy
 
 import inspect
 from typing import Any, Dict, Optional
@@ -542,6 +543,7 @@ class CrawlerRunConfig():
         parser_type: str = "lxml",
         scraping_strategy: ContentScrapingStrategy = None,
         proxy_config: dict = None,
+        proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
         # SSL Parameters
         fetch_ssl_certificate: bool = False,
         # Caching Parameters
@@ -620,6 +622,7 @@ class CrawlerRunConfig():
         self.parser_type = parser_type
         self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
         self.proxy_config = proxy_config
+        self.proxy_rotation_strategy = proxy_rotation_strategy
 
         # SSL Parameters
         self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -731,6 +734,7 @@ class CrawlerRunConfig():
             parser_type=kwargs.get("parser_type", "lxml"),
             scraping_strategy=kwargs.get("scraping_strategy"),
             proxy_config=kwargs.get("proxy_config"),
+            proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
             # SSL Parameters
             fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
             # Caching Parameters
@@ -827,6 +831,7 @@ class CrawlerRunConfig():
             "parser_type": self.parser_type,
             "scraping_strategy": self.scraping_strategy,
             "proxy_config": self.proxy_config,
+            "proxy_rotation_strategy": self.proxy_rotation_strategy,
             "fetch_ssl_certificate": self.fetch_ssl_certificate,
             "cache_mode": self.cache_mode,
             "session_id": self.session_id,
@@ -394,6 +394,19 @@ class AsyncWebCrawler:
                         tag="FETCH",
                     )
 
+            # Update proxy configuration from rotation strategy if available
+            if config and config.proxy_rotation_strategy:
+                next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
+                if next_proxy:
+                    if verbose:
+                        self.logger.info(
+                            message="Switch proxy: {proxy}",
+                            tag="PROXY",
+                            params={"proxy": next_proxy.get("server")},
+                        )
+                    config.proxy_config = next_proxy
+                    # config = config.clone(proxy_config=next_proxy)
+
             # Fetch fresh content if needed
             if not cached_result or not html:
                 t1 = time.perf_counter()
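
Note the commented-out `clone()` call above: as committed, the rotation mutates `config.proxy_config` in place, so a run config shared across concurrent requests sees the proxy change underneath it. Purely as a sketch of the per-request alternative the comment hints at (a hypothetical helper using the existing `CrawlerRunConfig.clone()`):

```python
from crawl4ai import CrawlerRunConfig

async def config_for_next_request(config: CrawlerRunConfig) -> CrawlerRunConfig:
    """Sketch only: return a per-request config carrying the next proxy, leaving `config` untouched."""
    if config.proxy_rotation_strategy:
        next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
        if next_proxy:
            # clone() copies the config with only proxy_config overridden
            return config.clone(proxy_config=next_proxy)
    return config
```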
crawl4ai/proxy_strategy.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+from typing import List, Dict, Optional
+from abc import ABC, abstractmethod
+from itertools import cycle
+
+class ProxyRotationStrategy(ABC):
+    """Base abstract class for proxy rotation strategies"""
+
+    @abstractmethod
+    async def get_next_proxy(self) -> Optional[Dict]:
+        """Get next proxy configuration from the strategy"""
+        pass
+
+    @abstractmethod
+    def add_proxies(self, proxies: List[Dict]):
+        """Add proxy configurations to the strategy"""
+        pass
+
+class RoundRobinProxyStrategy(ProxyRotationStrategy):
+    """Simple round-robin proxy rotation strategy"""
+
+    def __init__(self, proxies: List[Dict] = None):
+        """
+        Initialize with optional list of proxy configurations
+
+        Args:
+            proxies: List of proxy config dictionaries, each containing at least
+                'server' key with proxy URL
+        """
+        self._proxies = []
+        self._proxy_cycle = None
+        if proxies:
+            self.add_proxies(proxies)
+
+    def add_proxies(self, proxies: List[Dict]):
+        """Add new proxies to the rotation pool"""
+        self._proxies.extend(proxies)
+        self._proxy_cycle = cycle(self._proxies)
+
+    async def get_next_proxy(self) -> Optional[Dict]:
+        """Get next proxy in round-robin fashion"""
+        if not self._proxy_cycle:
+            return None
+        return next(self._proxy_cycle)
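
Any rotation policy can be plugged in by subclassing the `ProxyRotationStrategy` ABC above; purely as an illustration, a hypothetical random-pick variant could look like this:

```python
import random
from typing import Dict, List, Optional

from crawl4ai import ProxyRotationStrategy  # exported by this commit


class RandomProxyStrategy(ProxyRotationStrategy):
    """Illustrative strategy: pick a random proxy for every request."""

    def __init__(self, proxies: List[Dict] = None):
        self._proxies: List[Dict] = []
        if proxies:
            self.add_proxies(proxies)

    def add_proxies(self, proxies: List[Dict]):
        # Extend the pool; no cycle object needed for random selection
        self._proxies.extend(proxies)

    async def get_next_proxy(self) -> Optional[Dict]:
        return random.choice(self._proxies) if self._proxies else None
```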
docs/examples/proxy_rotation_demo.py (new file, 161 lines)
@@ -0,0 +1,161 @@
+import os
+import re
+from typing import List, Dict
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    RoundRobinProxyStrategy
+)
+
+def load_proxies_from_env() -> List[Dict]:
+    """Load proxies from PROXIES environment variable"""
+    proxies = []
+    try:
+        proxy_list = os.getenv("PROXIES", "").split(",")
+        for proxy in proxy_list:
+            if not proxy:
+                continue
+            ip, port, username, password = proxy.split(":")
+            proxies.append({
+                "server": f"http://{ip}:{port}",
+                "username": username,
+                "password": password,
+                "ip": ip  # Store original IP for verification
+            })
+    except Exception as e:
+        print(f"Error loading proxies from environment: {e}")
+    return proxies
+
+async def demo_proxy_rotation():
+    """
+    Proxy Rotation Demo using RoundRobinProxyStrategy
+    ===============================================
+    Demonstrates proxy rotation using the strategy pattern.
+    """
+    print("\n=== Proxy Rotation Demo (Round Robin) ===")
+
+    # Load proxies and create rotation strategy
+    proxies = load_proxies_from_env()
+    if not proxies:
+        print("No proxies found in environment. Set PROXIES env variable!")
+        return
+
+    proxy_strategy = RoundRobinProxyStrategy(proxies)
+
+    # Create configs
+    browser_config = BrowserConfig(headless=True, verbose=False)
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        proxy_rotation_strategy=proxy_strategy
+    )
+
+    # Test URLs
+    urls = ["https://httpbin.org/ip"] * len(proxies)  # Test each proxy once
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        for url in urls:
+            result = await crawler.arun(url=url, config=run_config)
+
+            if result.success:
+                # Extract IP from response
+                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+                current_proxy = run_config.proxy_config if run_config.proxy_config else None
+
+                if current_proxy:
+                    print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
+                    verified = ip_match and ip_match.group(0) == current_proxy['ip']
+                    if verified:
+                        print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
+                    else:
+                        print("❌ Proxy failed or IP mismatch!")
+            else:
+                print(f"Request failed: {result.error_message}")
+
+async def demo_proxy_rotation_batch():
+    """
+    Proxy Rotation Demo with Batch Processing
+    =======================================
+    Demonstrates proxy rotation using arun_many with memory dispatcher.
+    """
+    print("\n=== Proxy Rotation Batch Demo ===")
+
+    try:
+        # Load proxies and create rotation strategy
+        proxies = load_proxies_from_env()
+        if not proxies:
+            print("No proxies found in environment. Set PROXIES env variable!")
+            return
+
+        proxy_strategy = RoundRobinProxyStrategy(proxies)
+
+        # Configurations
+        browser_config = BrowserConfig(headless=True, verbose=False)
+        run_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            proxy_rotation_strategy=proxy_strategy,
+            markdown_generator=DefaultMarkdownGenerator()
+        )
+
+        # Test URLs - multiple requests to test rotation
+        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice
+
+        print("\n📈 Initializing crawler with proxy rotation...")
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            monitor = CrawlerMonitor(
+                max_visible_rows=10,
+                display_mode=DisplayMode.DETAILED
+            )
+
+            dispatcher = MemoryAdaptiveDispatcher(
+                memory_threshold_percent=80.0,
+                check_interval=0.5,
+                max_session_permit=1,  # len(proxies), # Match concurrent sessions to proxy count
+                # monitor=monitor
+            )
+
+            print("\n🚀 Starting batch crawl with proxy rotation...")
+            results = await crawler.arun_many(
+                urls=urls,
+                config=run_config,
+                dispatcher=dispatcher
+            )
+
+            # Verify results
+            success_count = 0
+            for result in results:
+                if result.success:
+                    ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+                    current_proxy = run_config.proxy_config if run_config.proxy_config else None
+
+                    if current_proxy and ip_match:
+                        print(f"URL {result.url}")
+                        print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0)}")
+                        verified = ip_match.group(0) == current_proxy['ip']
+                        if verified:
+                            print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
+                            success_count += 1
+                        else:
+                            print("❌ Proxy failed or IP mismatch!")
+                    print("---")
+
+            print(f"\n✅ Completed {len(results)} requests with {success_count} successful proxy verifications")
+
+    except Exception as e:
+        print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")
+
+if __name__ == "__main__":
+    import asyncio
+    from crawl4ai import (
+        CrawlerMonitor,
+        DisplayMode,
+        MemoryAdaptiveDispatcher,
+        DefaultMarkdownGenerator
+    )
+
+    async def run_demos():
+        # await demo_proxy_rotation()  # Original single-request demo
+        await demo_proxy_rotation_batch()  # New batch processing demo
+
+    asyncio.run(run_demos())
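
The demo's `load_proxies_from_env()` expects `PROXIES` to hold comma-separated `ip:port:username:password` entries; for example (placeholder addresses and credentials):

```python
import os

# Placeholder values for illustration only; substitute real proxy entries.
os.environ["PROXIES"] = (
    "203.0.113.10:8080:user1:pass1,"
    "203.0.113.11:8080:user2:pass2"
)
```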
@@ -160,41 +160,9 @@ The `arun_many()` method now uses an intelligent dispatcher that:
 
 ### 4.2 Example Usage
 
+Check page [Multi-url Crawling](../advanced/multi-url-crawling.md) for a detailed example of how to use `arun_many()`.
 
-```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, RateLimitConfig
-from crawl4ai.dispatcher import DisplayMode
-
-# Configure browser
-browser_cfg = BrowserConfig(headless=True)
-
-# Configure crawler with rate limiting
-run_cfg = CrawlerRunConfig(
-    # Enable rate limiting
-    enable_rate_limiting=True,
-    rate_limit_config=RateLimitConfig(
-        base_delay=(1.0, 2.0),  # Random delay between 1-2 seconds
-        max_delay=30.0,  # Maximum delay after rate limit hits
-        max_retries=2,  # Number of retries before giving up
-        rate_limit_codes=[429, 503]  # Status codes that trigger rate limiting
-    ),
-    # Resource monitoring
-    memory_threshold_percent=70.0,  # Pause if memory exceeds this
-    check_interval=0.5,  # How often to check resources
-    max_session_permit=3,  # Maximum concurrent crawls
-    display_mode=DisplayMode.DETAILED.value  # Show detailed progress
-)
-
-urls = [
-    "https://example.com/page1",
-    "https://example.com/page2",
-    "https://example.com/page3"
-]
-
-async with AsyncWebCrawler(config=browser_cfg) as crawler:
-    results = await crawler.arun_many(urls, config=run_cfg)
-    for result in results:
-        print(f"URL: {result.url}, Success: {result.success}")
-```
 
 ### 4.3 Key Features
 
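
With the `enable_rate_limiting`/`RateLimitConfig` parameters removed from the run config here, throttling and concurrency control for batch crawls sit with the dispatcher passed to `arun_many()`. A hedged sketch built from the same pieces the demo script above uses (URLs are placeholders):

```python
import asyncio
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    MemoryAdaptiveDispatcher,
)

async def main():
    browser_cfg = BrowserConfig(headless=True)
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    # Resource-aware dispatching takes over from the removed rate-limit knobs
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,  # pause new crawls above this memory usage
        check_interval=1.0,             # seconds between resource checks
        max_session_permit=3,           # maximum concurrent crawl sessions
    )
    urls = ["https://example.com/page1", "https://example.com/page2"]
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        results = await crawler.arun_many(urls=urls, config=run_cfg, dispatcher=dispatcher)
        for result in results:
            print(f"URL: {result.url}, Success: {result.success}")

asyncio.run(main())
```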
@@ -159,32 +159,7 @@ Use these for link-level content filtering (often to keep crawls “internal”
 
 ---
 
-### G) **Rate Limiting & Resource Management**
-
-| **Parameter** | **Type / Default** | **What It Does** |
-|------------------------------|----------------------------------------|------------------|
-| **`enable_rate_limiting`** | `bool` (default: `False`) | Enable intelligent rate limiting for multiple URLs |
-| **`rate_limit_config`** | `RateLimitConfig` (default: `None`) | Configuration for rate limiting behavior |
-
-The `RateLimitConfig` class has these fields:
-
-| **Field** | **Type / Default** | **What It Does** |
-|--------------------|----------------------------------------|------------------|
-| **`base_delay`** | `Tuple[float, float]` (1.0, 3.0) | Random delay range between requests to the same domain |
-| **`max_delay`** | `float` (60.0) | Maximum delay after rate limit detection |
-| **`max_retries`** | `int` (3) | Number of retries before giving up on rate-limited requests |
-| **`rate_limit_codes`** | `List[int]` ([429, 503]) | HTTP status codes that trigger rate limiting behavior |
-
-| **Parameter** | **Type / Default** | **What It Does** |
-|-------------------------------|----------------------------------------|------------------|
-| **`memory_threshold_percent`** | `float` (70.0) | Maximum memory usage before pausing new crawls |
-| **`check_interval`** | `float` (1.0) | How often to check system resources (in seconds) |
-| **`max_session_permit`** | `int` (20) | Maximum number of concurrent crawl sessions |
-| **`display_mode`** | `str` (`None`, "DETAILED", "AGGREGATED") | How to display progress information |
-
----
-
-### H) **Debug & Logging**
+### G) **Debug & Logging**
 
 | **Parameter** | **Type / Default** | **What It Does** |
 |----------------|--------------------|------------------|
@@ -218,7 +193,7 @@ The `clone()` method is particularly useful when you need slightly different con
 
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, RateLimitConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 
 async def main():
     # Configure the browser
@@ -239,17 +214,6 @@ async def main():
         exclude_external_links=True,
         wait_for="css:.article-loaded",
         screenshot=True,
-        enable_rate_limiting=True,
-        rate_limit_config=RateLimitConfig(
-            base_delay=(1.0, 3.0),
-            max_delay=60.0,
-            max_retries=3,
-            rate_limit_codes=[429, 503]
-        ),
-        memory_threshold_percent=70.0,
-        check_interval=1.0,
-        max_session_permit=20,
-        display_mode="DETAILED",
         stream=True
     )
 
@@ -186,23 +186,19 @@ class CrawlerRunConfig:
-    - If `True`, enables rate limiting for batch processing.
-    - Requires `rate_limit_config` to be set.
-
-10. **`rate_limit_config`**:
-    - A `RateLimitConfig` object controlling rate limiting behavior.
-    - See below for details.
 
-11. **`memory_threshold_percent`**:
+10. **`memory_threshold_percent`**:
     - The memory threshold (as a percentage) to monitor.
     - If exceeded, the crawler will pause or slow down.
 
-12. **`check_interval`**:
+11. **`check_interval`**:
     - The interval (in seconds) to check system resources.
     - Affects how often memory and CPU usage are monitored.
 
-13. **`max_session_permit`**:
+12. **`max_session_permit`**:
     - The maximum number of concurrent crawl sessions.
     - Helps prevent overwhelming the system.
 
-14. **`display_mode`**:
+13. **`display_mode`**:
     - The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
     - Affects how much information is printed during the crawl.
 
@@ -236,58 +232,6 @@ The `clone()` method:
 - Leaves the original configuration unchanged
 - Perfect for creating variations without repeating all parameters
 
-### Rate Limiting & Resource Management
-
-For batch processing with `arun_many()`, you can enable intelligent rate limiting:
-
-```python
-from crawl4ai import RateLimitConfig
-
-config = CrawlerRunConfig(
-    enable_rate_limiting=True,
-    rate_limit_config=RateLimitConfig(
-        base_delay=(1.0, 3.0),       # Random delay range
-        max_delay=60.0,              # Max delay after rate limits
-        max_retries=3,               # Retries before giving up
-        rate_limit_codes=[429, 503]  # Status codes to watch
-    ),
-    memory_threshold_percent=70.0,   # Memory threshold
-    check_interval=1.0,              # Resource check interval
-    max_session_permit=20,           # Max concurrent crawls
-    display_mode="DETAILED"          # Progress display mode
-)
-```
-
-This configuration:
-- Implements intelligent rate limiting per domain
-- Monitors system resources
-- Provides detailed progress information
-- Manages concurrent crawls efficiently
-
-**Minimal Example**:
-
-```python
-from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
-
-crawl_conf = CrawlerRunConfig(
-    js_code="document.querySelector('button#loadMore')?.click()",
-    wait_for="css:.loaded-content",
-    screenshot=True,
-    enable_rate_limiting=True,
-    rate_limit_config=RateLimitConfig(
-        base_delay=(1.0, 3.0),
-        max_delay=60.0,
-        max_retries=3,
-        rate_limit_codes=[429, 503]
-    ),
-    stream=True  # Enable streaming
-)
-
-async with AsyncWebCrawler() as crawler:
-    result = await crawler.arun(url="https://example.com", config=crawl_conf)
-    print(result.screenshot[:100])  # Base64-encoded PNG snippet
-```
 
 ---
 
 ## 3. Putting It All Together
@@ -322,13 +266,6 @@
     run_conf = CrawlerRunConfig(
         extraction_strategy=extraction,
         cache_mode=CacheMode.BYPASS,
-        enable_rate_limiting=True,
-        rate_limit_config=RateLimitConfig(
-            base_delay=(1.0, 3.0),
-            max_delay=60.0,
-            max_retries=3,
-            rate_limit_codes=[429, 503]
-        )
     )
 
     async with AsyncWebCrawler(config=browser_conf) as crawler:
@@ -31,9 +31,6 @@ import re
 import random
 from typing import Optional, Dict
-from dotenv import load_dotenv
-
-load_dotenv()
 
 from crawl4ai import (
     AsyncWebCrawler,
     BrowserConfig,
@@ -48,6 +45,7 @@ from crawl4ai import (
     LLMContentFilter
 )
 
+load_dotenv()
 
 async def demo_memory_dispatcher():
     """Demonstrates the new memory-efficient dispatcher system.
@@ -283,7 +281,7 @@ async def demo_proxy_rotation():
     """
     print("\n=== 8. Proxy Rotation Demo ===")
 
-    async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]:
+    async def get_next_proxy(proxy_file: str = "proxies.txt") -> Optional[Dict]:
         """Get next proxy from local file"""
         try:
             proxies = os.getenv("PROXIES", "").split(",")
@@ -323,7 +321,7 @@ async def demo_proxy_rotation():
                 if verified:
                     print(f"✅ Proxy working! IP matches: {proxy['ip']}")
                 else:
-                    print(f"❌ Proxy failed or IP mismatch!")
+                    print("❌ Proxy failed or IP mismatch!")
             else:
                 print(f"Failed with proxy {proxy['ip']}")