149 lines
4.1 KiB
Python
149 lines
4.1 KiB
Python
![]() |
import asyncio
|
||
|
import pytest
|
||
|
from typing import List
|
||
|
from crawl4ai import (
|
||
|
AsyncWebCrawler,
|
||
|
BrowserConfig,
|
||
|
CrawlerRunConfig,
|
||
|
MemoryAdaptiveDispatcher,
|
||
|
RateLimiter,
|
||
|
CacheMode
|
||
|
)
|
||
|
|
||
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "viewport",
    [(800, 600), (1024, 768), (1920, 1080)],
)
async def test_viewport_config(viewport):
    """Crawl example.com in headless Chromium at the given viewport size.

    ``viewport`` is a ``(width, height)`` pair. The test only asserts that
    the crawl reports success; it does not inspect the rendered dimensions.
    """
    viewport_w, viewport_h = viewport
    chromium = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=viewport_w,
        viewport_height=viewport_h,
    )

    async with AsyncWebCrawler(config=chromium) as crawler:
        run_config = CrawlerRunConfig(
            # cache_mode=CacheMode.BYPASS,
            page_timeout=30000,  # 30 seconds
        )
        outcome = await crawler.arun(url="https://example.com", config=run_config)
        assert outcome.success
|
||
|
|
||
|
@pytest.mark.asyncio
async def test_memory_management():
    """Run ``arun_many`` through a ``MemoryAdaptiveDispatcher``.

    The same URL is submitted three times and the test asserts that exactly
    one result comes back per submitted URL.
    """
    chromium = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=1024,
        viewport_height=768,
    )
    memory_dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=5,
    )

    # Three identical entries: the batch size matters here, not URL variety.
    targets = ["https://example.com"] * 3

    async with AsyncWebCrawler(config=chromium) as crawler:
        run_config = CrawlerRunConfig(page_timeout=30000)
        batch = await crawler.arun_many(
            urls=targets,
            config=run_config,
            dispatcher=memory_dispatcher,
        )
        assert len(batch) == len(targets)
|
||
|
|
||
|
@pytest.mark.asyncio
async def test_rate_limiting():
    """Crawl three URLs through a dispatcher configured with a ``RateLimiter``.

    Only the number of returned results is checked against the number of
    URLs submitted; per-result success is not asserted.
    """
    chromium = BrowserConfig(browser_type="chromium", headless=True)

    limiter = RateLimiter(
        base_delay=(1.0, 2.0),
        max_delay=5.0,
        max_retries=2,
    )
    throttled_dispatcher = MemoryAdaptiveDispatcher(
        rate_limiter=limiter,
        memory_threshold_percent=70.0,
    )

    targets = [
        "https://example.com",
        "https://example.org",
        "https://example.net",
    ]

    async with AsyncWebCrawler(config=chromium) as crawler:
        run_config = CrawlerRunConfig(page_timeout=30000)
        batch = await crawler.arun_many(
            urls=targets,
            config=run_config,
            dispatcher=throttled_dispatcher,
        )
        assert len(batch) == len(targets)
|
||
|
|
||
|
@pytest.mark.asyncio
async def test_javascript_execution():
    """Crawl example.com with a ``js_code`` snippet attached to the run config.

    The test asserts only that the crawl reports success; the value produced
    by the script is not inspected.
    """
    chromium = BrowserConfig(
        browser_type="chromium",
        headless=True,
        java_script_enabled=True,
    )

    # Snippet handed to the crawler: sets the body background colour to red
    # and returns the resulting style value. The leading newline is kept.
    script = (
        "\n"
        "document.body.style.backgroundColor = 'red';\n"
        "return document.body.style.backgroundColor;\n"
    )

    async with AsyncWebCrawler(config=chromium) as crawler:
        run_config = CrawlerRunConfig(js_code=script, page_timeout=30000)
        outcome = await crawler.arun(url="https://example.com", config=run_config)
        assert outcome.success
|
||
|
|
||
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "error_url",
    [
        "https://invalid.domain.test",
        "https://httpbin.org/status/404",
        "https://httpbin.org/status/503",
        "https://httpbin.org/status/403",
    ],
)
async def test_error_handling(error_url):
    """Crawl a URL that is expected to fail and check the failure is reported.

    The result must be marked unsuccessful and must carry a non-``None``
    ``error_message``.
    """
    chromium = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=chromium) as crawler:
        run_config = CrawlerRunConfig(
            page_timeout=10000,  # Short timeout for error cases
            cache_mode=CacheMode.BYPASS,
        )
        outcome = await crawler.arun(url=error_url, config=run_config)
        assert not outcome.success
        assert outcome.error_message is not None
|
||
|
|
||
|
if __name__ == "__main__":
    # Manual entry point: run a fixed selection of the tests one after another
    # without pytest. Each coroutine is created immediately before it runs.
    _manual_runs = (
        (test_viewport_config, ((1024, 768),)),
        (test_memory_management, ()),
        (test_rate_limiting, ()),
        (test_javascript_execution, ()),
    )
    for _test_fn, _test_args in _manual_runs:
        asyncio.run(_test_fn(*_test_args))
|