
Implements a new MemoryAdaptiveDispatcher class to manage concurrent crawling operations with memory monitoring and rate limiting capabilities.

Changes include:
- Added RateLimitConfig dataclass for configuring rate limiting behavior
- Extended CrawlerRunConfig with dispatcher-related settings
- Refactored arun_many to use the new dispatcher system
- Added memory threshold and session permit controls
- Integrated optional progress monitoring display

BREAKING CHANGE: The arun_many method now uses MemoryAdaptiveDispatcher by default, which may affect concurrent crawling behavior.
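Because of that breaking change, even a plain arun_many call with no dispatcher settings is now routed through MemoryAdaptiveDispatcher. A minimal sketch of what this means for existing callers (crawl_with_defaults is a hypothetical helper name; the dispatcher's default thresholds depend on the installed version):

import asyncio
from crawl4ai.async_webcrawler import AsyncWebCrawler

async def crawl_with_defaults(urls):
    # No dispatcher-related settings are passed: arun_many now falls back to
    # MemoryAdaptiveDispatcher, so large URL batches are throttled by memory
    # pressure instead of being scheduled without limit (defaults assumed to
    # come from the dispatcher itself).
    async with AsyncWebCrawler() as crawler:
        return await crawler.arun_many(urls)

results = asyncio.run(crawl_with_defaults(["https://example.com/page1", "https://example.com/page2"]))

The full example below opts into explicit rate limiting and resource monitoring via CrawlerRunConfig instead of relying on those defaults.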
import asyncio, time

from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
from crawl4ai.dispatcher import DisplayMode


async def crawl_with_rate_limiting(urls):
    """
    Example function demonstrating how to use AsyncWebCrawler with rate limiting
    and resource monitoring.

    Args:
        urls (List[str]): List of URLs to crawl

    Returns:
        List[CrawlResult]: List of crawl results for each URL
    """
    # Configure browser settings
    browser_config = BrowserConfig(
        headless=True,  # Run browser in headless mode
        verbose=False   # Minimize browser logging
    )

    # Configure crawler settings with rate limiting
    run_config = CrawlerRunConfig(
        # Enable rate limiting
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 2.0),       # Random delay between 1-2 seconds between requests
            max_delay=30.0,              # Maximum delay after rate limit hits
            max_retries=2,               # Number of retries before giving up
            rate_limit_codes=[429, 503]  # HTTP status codes to trigger rate limiting
        ),
        # Resource monitoring settings
        memory_threshold_percent=70.0,   # Pause crawling if memory usage exceeds this
        check_interval=0.5,              # How often to check resource usage
        max_session_permit=10,           # Maximum concurrent crawls
        display_mode=DisplayMode.DETAILED.value  # Show detailed progress
    )

    # Create and use crawler with context manager
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(urls, config=run_config)
        return results


def main():
    # Example URLs (replace with real URLs)
    urls = [
        f"https://example.com/page{i}" for i in range(1, 40)
    ]

    start = time.perf_counter()

    # Run the crawler
    results = asyncio.run(crawl_with_rate_limiting(urls))

    # Process results
    successful_results = [result for result in results if result.success]
    failed_results = [result for result in results if not result.success]

    end = time.perf_counter()

    # Print results
    print(f"Successful crawls: {len(successful_results)}")
    print(f"Failed crawls: {len(failed_results)}")
    print(f"Time taken: {end - start:.2f} seconds")


if __name__ == "__main__":
    main()