crawl4ai/tests/general/test_async_webcrawler.py
unclecode 66ac07b4f3 feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality:
- Add capture_network_requests and capture_console_messages config parameters
- Add network_requests and console_messages fields to models
- Implement Playwright event listeners to capture requests, responses, and console output
- Create detailed documentation and examples
- Add comprehensive tests

This feature enables deep visibility into web page activity for debugging,
security analysis, performance profiling, and API discovery in web applications.
2025-04-10 16:03:48 +08:00
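
The capture feature itself is not exercised in the general test file below. As a minimal sketch of how it would be used — assuming the `capture_network_requests`/`capture_console_messages` config parameters and the `network_requests`/`console_messages` result fields named in the commit message above, which are not verified against the library here:

import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig


async def capture_example():
    # Hedged sketch: the parameter and field names below come from the
    # commit message above, not from inspecting the library itself.
    run_config = CrawlerRunConfig(
        capture_network_requests=True,
        capture_console_messages=True,
    )
    browser_config = BrowserConfig(browser_type="chromium", headless=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url="https://example.com", config=run_config)
        print(f"{len(result.network_requests or [])} network requests captured")
        print(f"{len(result.console_messages or [])} console messages captured")


if __name__ == "__main__":
    asyncio.run(capture_example())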


import asyncio

import pytest

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    MemoryAdaptiveDispatcher,
    RateLimiter,
    CacheMode,
)


@pytest.mark.asyncio
@pytest.mark.parametrize("viewport", [
    (800, 600),
    (1024, 768),
    (1920, 1080),
])
async def test_viewport_config(viewport):
    """Test different viewport configurations"""
    width, height = viewport
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=width,
        viewport_height=height,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(
                # cache_mode=CacheMode.BYPASS,
                page_timeout=30000,  # 30 seconds
            ),
        )
        assert result.success


@pytest.mark.asyncio
async def test_memory_management():
    """Test memory-adaptive dispatching"""
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=1024,
        viewport_height=768,
    )
    # Throttle new crawl sessions once system memory use crosses the threshold.
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=5,
    )
    urls = ["https://example.com"] * 3  # Test with multiple identical URLs
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(
            urls=urls,
            config=CrawlerRunConfig(page_timeout=30000),
            dispatcher=dispatcher,
        )
        assert len(results) == len(urls)


@pytest.mark.asyncio
async def test_rate_limiting():
    """Test rate limiting functionality"""
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,
    )
    dispatcher = MemoryAdaptiveDispatcher(
        rate_limiter=RateLimiter(
            base_delay=(1.0, 2.0),  # random per-request delay drawn from this range
            max_delay=5.0,
            max_retries=2,
        ),
        memory_threshold_percent=70.0,
    )
    urls = [
        "https://example.com",
        "https://example.org",
        "https://example.net",
    ]
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(
            urls=urls,
            config=CrawlerRunConfig(page_timeout=30000),
            dispatcher=dispatcher,
        )
        assert len(results) == len(urls)


@pytest.mark.asyncio
async def test_javascript_execution():
    """Test JavaScript execution capabilities"""
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,
        java_script_enabled=True,
    )
    # Wrap the snippet in an IIFE so the top-level `return` is valid JavaScript
    # however the crawler evaluates it.
    js_code = """
    (() => {
        document.body.style.backgroundColor = 'red';
        return document.body.style.backgroundColor;
    })();
    """
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(
                js_code=js_code,
                page_timeout=30000,
            ),
        )
        assert result.success


@pytest.mark.asyncio
@pytest.mark.parametrize("error_url", [
    "https://invalid.domain.test",
    "https://httpbin.org/status/404",
    "https://httpbin.org/status/503",
    "https://httpbin.org/status/403",
])
async def test_error_handling(error_url):
    """Test error handling for various failure scenarios"""
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=error_url,
            config=CrawlerRunConfig(
                page_timeout=10000,  # short timeout for error cases
                cache_mode=CacheMode.BYPASS,
            ),
        )
        assert not result.success
        assert result.error_message is not None


if __name__ == "__main__":
    # Quick manual run covering one parametrization of each test.
    asyncio.run(test_viewport_config((1024, 768)))
    asyncio.run(test_memory_management())
    asyncio.run(test_rate_limiting())
    asyncio.run(test_javascript_execution())
    asyncio.run(test_error_handling("https://httpbin.org/status/404"))
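
Note: the `@pytest.mark.asyncio` markers require the pytest-asyncio plugin, so the usual way to run this file is `pytest tests/general/test_async_webcrawler.py -v`; the `__main__` block is only a fallback for quick manual runs.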