crawl4ai/docs/examples/arun_vs_arun_many.py

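"""Compare sequential crawling with arun() against parallel crawling with arun_many().

Crawls the same set of URLs three ways and prints the wall-clock time for each:
  1. Sequentially, one arun() call per URL.
  2. In parallel with arun_many() and a MemoryAdaptiveDispatcher using a RateLimiter.
  3. In parallel with arun_many() and a dispatcher with no rate limiter.
"""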
import asyncio
import time
from crawl4ai.async_webcrawler import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter

# Set to True to enable verbose per-request logging from the crawler.
VERBOSE = False


async def crawl_sequential(urls):
    """Crawl each URL one at a time with arun()."""
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
    results = []
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        for url in urls:
            # arun() returns a result container; the first entry is the page result.
            result_container = await crawler.arun(url=url, config=config)
            results.append(result_container[0])
    total_time = time.perf_counter() - start_time
    return total_time, results


async def crawl_parallel_dispatcher(urls):
    """Crawl all URLs with arun_many() using a rate-limited MemoryAdaptiveDispatcher."""
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
    # Dispatcher with rate limiter enabled (default behavior)
    dispatcher = MemoryAdaptiveDispatcher(
        rate_limiter=RateLimiter(base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3),
        max_session_permit=50,
    )
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
        # arun_many() returns a list in batch mode or an async generator when
        # streaming is enabled; handle both shapes.
        results = []
        if isinstance(result_container, list):
            results = result_container
        else:
            async for res in result_container:
                results.append(res)
    total_time = time.perf_counter() - start_time
    return total_time, results


async def crawl_parallel_no_rate_limit(urls):
    """Crawl all URLs with arun_many() using a dispatcher with no rate limiter."""
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
    # Dispatcher with no rate limiter and a high session permit to avoid queuing
    dispatcher = MemoryAdaptiveDispatcher(
        rate_limiter=None,
        max_session_permit=len(urls),  # allow all URLs concurrently
    )
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
        # Same handling as above: list in batch mode, async generator when streaming.
        results = []
        if isinstance(result_container, list):
            results = result_container
        else:
            async for res in result_container:
                results.append(res)
    total_time = time.perf_counter() - start_time
    return total_time, results


async def main():
    urls = ["https://example.com"] * 100

    print(f"Crawling {len(urls)} URLs sequentially...")
    seq_time, seq_results = await crawl_sequential(urls)
    print(f"Sequential crawling took: {seq_time:.2f} seconds\n")

    print(f"Crawling {len(urls)} URLs in parallel using arun_many with dispatcher (with rate limit)...")
    disp_time, disp_results = await crawl_parallel_dispatcher(urls)
    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds\n")

    print(f"Crawling {len(urls)} URLs in parallel using dispatcher with no rate limiter...")
    no_rl_time, no_rl_results = await crawl_parallel_no_rate_limit(urls)
    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds\n")

    print("Crawl4ai - Crawling Comparison")
    print("--------------------------------------------------------")
    print(f"Sequential crawling took: {seq_time:.2f} seconds")
    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds")
    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds")
if __name__ == "__main__":
asyncio.run(main())