
Add special handling for single URL requests in Docker API to use arun() instead of arun_many()
Add new example script demonstrating performance differences between sequential and parallel crawling
Update cache mode from aggressive to bypass in examples and tests
Remove unused dependencies (zstandard, msgpack)

BREAKING CHANGE: Changed default cache_mode from aggressive to bypass in examples
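The single-URL fast path amounts to dispatching on request size. A minimal sketch of the idea (the handler name and signature below are hypothetical, not the actual Docker API code):

    # Hypothetical handler: illustrates only the single-URL special case.
    async def handle_crawl_request(crawler, urls, config):
        if len(urls) == 1:
            # One URL: call arun() directly and skip the arun_many() dispatcher.
            result_container = await crawler.arun(url=urls[0], config=config)
            return [result_container[0]]
        # Several URLs: fan out through arun_many() as before.
        return await crawler.arun_many(urls=urls, config=config)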
80 lines
3.4 KiB
Python
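"""Compare sequential and parallel crawling performance with Crawl4ai.

Benchmarks three strategies over the same URL list: sequential arun()
calls, arun_many() with a rate-limited MemoryAdaptiveDispatcher, and
arun_many() with the rate limiter disabled.
"""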
import asyncio
import time

from crawl4ai.async_webcrawler import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter

VERBOSE = False

async def crawl_sequential(urls):
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
    results = []
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        # One request at a time, reusing a single crawler instance.
        for url in urls:
            # The first element of the returned container is the crawl result.
            result_container = await crawler.arun(url=url, config=config)
            results.append(result_container[0])
    total_time = time.perf_counter() - start_time
    return total_time, results

async def crawl_parallel_dispatcher(urls):
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
    # Dispatcher with rate limiter enabled (default behavior)
    dispatcher = MemoryAdaptiveDispatcher(
        rate_limiter=RateLimiter(base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3),
        max_session_permit=50,
    )
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
        # arun_many() returns either a list of results or an async generator
        # (when streaming is enabled), so handle both shapes.
        results = []
        if isinstance(result_container, list):
            results = result_container
        else:
            async for res in result_container:
                results.append(res)
    total_time = time.perf_counter() - start_time
    return total_time, results

async def crawl_parallel_no_rate_limit(urls):
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
    # Dispatcher with no rate limiter and a high session permit to avoid queuing
    dispatcher = MemoryAdaptiveDispatcher(
        rate_limiter=None,
        max_session_permit=len(urls),  # allow all URLs concurrently
    )
    start_time = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
        # Same list-or-stream handling as crawl_parallel_dispatcher().
        results = []
        if isinstance(result_container, list):
            results = result_container
        else:
            async for res in result_container:
                results.append(res)
    total_time = time.perf_counter() - start_time
    return total_time, results

async def main():
    urls = ["https://example.com"] * 100

    print(f"Crawling {len(urls)} URLs sequentially...")
    seq_time, seq_results = await crawl_sequential(urls)
    print(f"Sequential crawling took: {seq_time:.2f} seconds\n")

    print(f"Crawling {len(urls)} URLs in parallel using arun_many with dispatcher (with rate limit)...")
    disp_time, disp_results = await crawl_parallel_dispatcher(urls)
    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds\n")

    print(f"Crawling {len(urls)} URLs in parallel using dispatcher with no rate limiter...")
    no_rl_time, no_rl_results = await crawl_parallel_no_rate_limit(urls)
    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds\n")

    print("Crawl4ai - Crawling Comparison")
    print("--------------------------------------------------------")
    print(f"Sequential crawling took: {seq_time:.2f} seconds")
    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds")
    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds")


if __name__ == "__main__":
    asyncio.run(main())
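Running the script prints the three timings side by side. Expect the unthrottled dispatcher run to be the fastest: the other two either serialize requests entirely or deliberately space them out via the 1–3 second base_delay of the rate limiter.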