import asyncio
import time
from rich import print
from rich.table import Table
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    MemoryAdaptiveDispatcher,
    SemaphoreDispatcher,
    RateLimiter,
    CrawlerMonitor,
    DisplayMode,
    CacheMode,
    LXMLWebScrapingStrategy,
)
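
# Benchmark script: each coroutine below crawls the same URL list with a
# different crawl4ai dispatcher configuration and returns
# (result count, elapsed seconds) so the strategies can be compared side by side.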
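

# MemoryAdaptiveDispatcher throttles concurrency based on system memory usage:
# new crawl tasks are held back once usage crosses memory_threshold_percent, and
# at most max_session_permit sessions run at once. CrawlerMonitor renders a live
# status table in the terminal while the crawl runs.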
async def memory_adaptive(urls, browser_config, run_config):
    """Memory adaptive crawler with monitoring"""
    start = time.perf_counter()
    async with AsyncWebCrawler(config=browser_config) as crawler:
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=70.0,
            max_session_permit=10,
            monitor=CrawlerMonitor(
                max_visible_rows=15, display_mode=DisplayMode.DETAILED
            ),
        )
        results = await crawler.arun_many(
            urls, config=run_config, dispatcher=dispatcher
        )
        duration = time.perf_counter() - start
        return len(results), duration
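

# RateLimiter spaces out requests per domain: each request waits a random delay
# drawn from base_delay, backs off (up to max_delay) when the server signals
# rate limiting, and gives up after max_retries retries.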
async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
    """Memory adaptive crawler with rate limiting"""
    start = time.perf_counter()
    async with AsyncWebCrawler(config=browser_config) as crawler:
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=95.0,
            max_session_permit=10,
            rate_limiter=RateLimiter(
                base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
            ),
            monitor=CrawlerMonitor(
                max_visible_rows=15, display_mode=DisplayMode.DETAILED
            ),
        )
        results = await crawler.arun_many(
            urls, config=run_config, dispatcher=dispatcher
        )
        duration = time.perf_counter() - start
        return len(results), duration
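

# SemaphoreDispatcher caps concurrency at a fixed semaphore_count rather than
# adapting to memory pressure, which makes resource usage more predictable.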
async def semaphore(urls, browser_config, run_config):
    """Basic semaphore crawler"""
    start = time.perf_counter()
    async with AsyncWebCrawler(config=browser_config) as crawler:
        dispatcher = SemaphoreDispatcher(
            semaphore_count=5,
            monitor=CrawlerMonitor(
                max_visible_rows=15, display_mode=DisplayMode.DETAILED
            ),
        )
        results = await crawler.arun_many(
            urls, config=run_config, dispatcher=dispatcher
        )
        duration = time.perf_counter() - start
        return len(results), duration
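

# Fixed-concurrency dispatcher combined with a RateLimiter, trading some raw
# speed for politeness toward the crawled domains.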
async def semaphore_with_rate_limit(urls, browser_config, run_config):
    """Semaphore crawler with rate limiting"""
    start = time.perf_counter()
    async with AsyncWebCrawler(config=browser_config) as crawler:
        dispatcher = SemaphoreDispatcher(
            semaphore_count=5,
            rate_limiter=RateLimiter(
                base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
            ),
            monitor=CrawlerMonitor(
                max_visible_rows=15, display_mode=DisplayMode.DETAILED
            ),
        )
        results = await crawler.arun_many(
            urls, config=run_config, dispatcher=dispatcher
        )
        duration = time.perf_counter() - start
        return len(results), duration
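

# Each strategy coroutine returns (urls_crawled, duration); the table sorts the
# strategies by duration and derives URLs/second as a throughput figure.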
def create_performance_table(results):
    """Creates a rich table showing performance results"""
    table = Table(title="Crawler Strategy Performance Comparison")
    table.add_column("Strategy", style="cyan")
    table.add_column("URLs Crawled", justify="right", style="green")
    table.add_column("Time (seconds)", justify="right", style="yellow")
    table.add_column("URLs/second", justify="right", style="magenta")

    sorted_results = sorted(results.items(), key=lambda x: x[1][1])

    for strategy, (urls_crawled, duration) in sorted_results:
        urls_per_second = urls_crawled / duration
        table.add_row(
            strategy, str(urls_crawled), f"{duration:.2f}", f"{urls_per_second:.2f}"
        )

    return table
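

# The example.com URLs are placeholders; swap in real pages before benchmarking.
# Only the memory-adaptive run is enabled below; uncomment the other entries to
# compare all four strategies in one run.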
async def main():
    urls = [f"https://example.com/page{i}" for i in range(1, 40)]
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS, scraping_strategy=LXMLWebScrapingStrategy()
    )

    results = {
        "Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),
        # "Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(
        #     urls, browser_config, run_config
        # ),
        # "Semaphore": await semaphore(urls, browser_config, run_config),
        # "Semaphore + Rate Limit": await semaphore_with_rate_limit(
        #     urls, browser_config, run_config
        # ),
    }

    table = create_performance_table(results)
    print("\nPerformance Summary:")
    print(table)


if __name__ == "__main__":
    asyncio.run(main())