import asyncio
import time

from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
from crawl4ai.dispatcher import DisplayMode


async def crawl_with_rate_limiting(urls):
    """
    Example function demonstrating how to use AsyncWebCrawler with
    rate limiting and resource monitoring.

    Args:
        urls (List[str]): List of URLs to crawl

    Returns:
        List[CrawlResult]: List of crawl results for each URL
    """
    # Configure browser settings
    browser_config = BrowserConfig(
        headless=True,  # Run browser in headless mode
        verbose=False   # Minimize browser logging
    )

    # Configure crawler settings with rate limiting
    run_config = CrawlerRunConfig(
        # Enable rate limiting
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 2.0),       # Random 1-2 second delay between requests
            max_delay=30.0,              # Maximum delay after a rate-limit hit
            max_retries=2,               # Number of retries before giving up
            rate_limit_codes=[429, 503]  # HTTP status codes that trigger backoff
        ),
        # Resource monitoring settings
        memory_threshold_percent=70.0,  # Pause crawling if memory usage exceeds this
        check_interval=0.5,             # How often (seconds) to check resource usage
        max_session_permit=10,          # Maximum concurrent crawls
        display_mode=DisplayMode.DETAILED.value  # Show detailed progress
    )

    # Create and use crawler with context manager
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(urls, config=run_config)
        return results


def main():
    # Example URLs (replace with real URLs)
    urls = [
        f"https://example.com/page{i}" for i in range(1, 40)
    ]

    start = time.perf_counter()

    # Run the crawler
    results = asyncio.run(crawl_with_rate_limiting(urls))

    # Process results
    successful_results = [result for result in results if result.success]
    failed_results = [result for result in results if not result.success]

    end = time.perf_counter()
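
    # Optional sketch (not part of the original example): preview the extracted
    # content of each successful crawl. This assumes CrawlResult exposes `url`
    # and `markdown` attributes, as crawl4ai result objects typically do.
    for result in successful_results:
        preview = str(result.markdown or "")[:80].replace("\n", " ")
        print(f"[OK] {result.url}: {preview}")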

    # Print results
    print(f"Successful crawls: {len(successful_results)}")
    print(f"Failed crawls: {len(failed_results)}")
    print(f"Time taken: {end - start:.2f} seconds")


if __name__ == "__main__":
    main()