import asyncio
import time

from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
from crawl4ai.dispatcher import DisplayMode


async def crawl_with_rate_limiting(urls):
    """
    Example function demonstrating how to use AsyncWebCrawler with
    rate limiting and resource monitoring.

    Args:
        urls (List[str]): List of URLs to crawl

    Returns:
        List[CrawlResult]: List of crawl results for each URL
    """
    # Configure browser settings
    browser_config = BrowserConfig(
        headless=True,  # Run browser in headless mode
        verbose=False   # Minimize browser logging
    )

    # Configure crawler settings with rate limiting
    run_config = CrawlerRunConfig(
        # Enable rate limiting
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 2.0),       # Random 1-2 second delay between requests
            max_delay=30.0,              # Maximum delay after a rate-limit hit
            max_retries=2,               # Number of retries before giving up
            rate_limit_codes=[429, 503]  # HTTP status codes that trigger backoff
        ),
        # Resource monitoring settings
        memory_threshold_percent=70.0,  # Pause crawling if memory usage exceeds this
        check_interval=0.5,             # How often (seconds) to check resource usage
        max_session_permit=10,          # Maximum concurrent crawls
        display_mode=DisplayMode.DETAILED.value  # Show detailed progress
    )

    # Create and use crawler with context manager
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(urls, config=run_config)
        return results


def main():
    # Example URLs (replace with real URLs)
    urls = [
        f"https://example.com/page{i}" for i in range(1, 40)
    ]

    start = time.perf_counter()

    # Run the crawler
    results = asyncio.run(crawl_with_rate_limiting(urls))

    # Process results
    successful_results = [result for result in results if result.success]
    failed_results = [result for result in results if not result.success]

    end = time.perf_counter()
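
    # Optional sketch (not part of the original example): preview the extracted
    # content of each successful crawl. This assumes CrawlResult exposes `url`
    # and `markdown` attributes, as crawl4ai result objects typically do.
    for result in successful_results:
        preview = str(result.markdown or "")[:80].replace("\n", " ")
        print(f"[OK] {result.url}: {preview}")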

    # Print results
    print(f"Successful crawls: {len(successful_results)}")
    print(f"Failed crawls: {len(failed_results)}")
    print(f"Time taken: {end - start:.2f} seconds")


if __name__ == "__main__":
    main()