crawl4ai/docs/examples/dispatcher_example.py

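"""
Example: rate-limited concurrent crawling with AsyncWebCrawler.

Shows how to configure CrawlerRunConfig with rate limiting (RateLimitConfig),
memory/resource monitoring thresholds, and a concurrency cap, then run a batch
of URLs through arun_many().
"""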
import asyncio
import time

from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
from crawl4ai.dispatcher import DisplayMode


async def crawl_with_rate_limiting(urls):
    """
    Example function demonstrating how to use AsyncWebCrawler with rate limiting
    and resource monitoring.

    Args:
        urls (List[str]): List of URLs to crawl

    Returns:
        List[CrawlResult]: List of crawl results for each URL
    """
    # Configure browser settings
    browser_config = BrowserConfig(
        headless=True,   # Run browser in headless mode
        verbose=False    # Minimize browser logging
    )

    # Configure crawler settings with rate limiting
    run_config = CrawlerRunConfig(
        # Enable rate limiting
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 2.0),       # Random delay of 1-2 seconds between requests
            max_delay=30.0,              # Maximum delay after rate limit hits
            max_retries=2,               # Number of retries before giving up
            rate_limit_codes=[429, 503]  # HTTP status codes that trigger rate limiting
        ),
        # Resource monitoring settings
        memory_threshold_percent=70.0,   # Pause crawling if memory usage exceeds this
        check_interval=0.5,              # How often to check resource usage (seconds)
        max_session_permit=10,           # Maximum concurrent crawls
        display_mode=DisplayMode.DETAILED.value  # Show detailed progress
    )

    # Create and use the crawler with a context manager
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(urls, config=run_config)
        return results


def main():
    # Example URLs (replace with real URLs)
    urls = [f"https://example.com/page{i}" for i in range(1, 40)]

    start = time.perf_counter()

    # Run the crawler
    results = asyncio.run(crawl_with_rate_limiting(urls))

    # Process results
    successful_results = [result for result in results if result.success]
    failed_results = [result for result in results if not result.success]

    end = time.perf_counter()

    # Print results
    print(f"Successful crawls: {len(successful_results)}")
    print(f"Failed crawls: {len(failed_results)}")
    print(f"Time taken: {end - start:.2f} seconds")


if __name__ == "__main__":
    main()