crawl4ai/tests/general/test_network_console_capture.py
feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality:
- Add capture_network_requests and capture_console_messages config parameters
- Add network_requests and console_messages fields to models
- Implement Playwright event listeners to capture requests, responses, and console output
- Create detailed documentation and examples
- Add comprehensive tests

This feature enables deep visibility into web page activity for debugging,
security analysis, performance profiling, and API discovery in web applications.
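
In short, both captures are opt-in flags on CrawlerRunConfig, and the captured events surface as lists of dicts on the crawl result. A minimal usage sketch (the URL is a placeholder; the field and key names match what the test below exercises):

    import asyncio
    from crawl4ai.async_webcrawler import AsyncWebCrawler
    from crawl4ai.async_configs import CrawlerRunConfig

    async def demo():
        config = CrawlerRunConfig(
            capture_network_requests=True,   # populates result.network_requests
            capture_console_messages=True,   # populates result.console_messages
        )
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com", config=config)
            # Network events are dicts; "event_type" separates requests from responses.
            for event in result.network_requests:
                print(event.get("event_type"), event.get("url"))
            # Console messages carry a "type" (log/error/warning) and a "text".
            for msg in result.console_messages:
                print(msg.get("type"), msg.get("text"))

    asyncio.run(demo())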

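"""Integration test for network request and console message capturing.

Starts a local aiohttp server whose page triggers XHR/fetch requests and
console output, then crawls it with capture disabled and enabled to check
that result.network_requests and result.console_messages behave as expected.
"""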
import asyncio

from aiohttp import web

from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig

async def start_test_server():
    app = web.Application()

    async def basic_page(request):
        return web.Response(text="""
        <!DOCTYPE html>
        <html>
        <head>
            <title>Network Request Test</title>
        </head>
        <body>
            <h1>Test Page for Network Capture</h1>
            <p>This page performs network requests and console logging.</p>
            <img src="/image.png" alt="Test Image">
            <script>
                console.log("Basic console log");
                console.error("Error message");
                console.warn("Warning message");

                // Make some XHR requests
                const xhr = new XMLHttpRequest();
                xhr.open('GET', '/api/data', true);
                xhr.send();

                // Make a fetch request
                fetch('/api/json')
                    .then(response => response.json())
                    .catch(error => console.error('Fetch error:', error));

                // Trigger an error
                setTimeout(() => {
                    try {
                        nonExistentFunction();
                    } catch (e) {
                        console.error("Caught error:", e);
                    }
                }, 100);
            </script>
        </body>
        </html>
        """, content_type="text/html")

    async def image(request):
        # Return a small 1x1 transparent PNG
        return web.Response(
            body=bytes.fromhex(
                '89504E470D0A1A0A0000000D49484452000000010000000108060000001F15C489'
                '0000000D4944415478DA63FAFFFF3F030079DB00018D959DE7'
                '0000000049454E44AE426082'
            ),
            content_type="image/png",
        )

    async def api_data(request):
        return web.Response(text="sample data")

    async def api_json(request):
        return web.json_response({"status": "success", "message": "JSON data"})

    # Register routes
    app.router.add_get('/', basic_page)
    app.router.add_get('/image.png', image)
    app.router.add_get('/api/data', api_data)
    app.router.add_get('/api/json', api_json)

    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, 'localhost', 8080)
    await site.start()
    return runner

async def test_network_console_capture():
    print("\n=== Testing Network and Console Capture ===\n")

    # Start test server
    runner = await start_test_server()

    try:
        browser_config = BrowserConfig(headless=True)

        # Test with capture disabled (default)
        print("\n1. Testing with capture disabled (default)...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                wait_until="networkidle",  # Wait for network to be idle
            )
            result = await crawler.arun(url="http://localhost:8080/", config=config)

            assert result.network_requests is None, "Network requests should be None when capture is disabled"
            assert result.console_messages is None, "Console messages should be None when capture is disabled"
            print("✓ Default config correctly returns None for network_requests and console_messages")

        # Test with network capture enabled
        print("\n2. Testing with network capture enabled...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                wait_until="networkidle",  # Wait for network to be idle
                capture_network_requests=True,
            )
            result = await crawler.arun(url="http://localhost:8080/", config=config)

            assert result.network_requests is not None, "Network requests should be captured"
            print(f"✓ Captured {len(result.network_requests)} network requests")

            # Check if we have both requests and responses
            request_count = len([r for r in result.network_requests if r.get("event_type") == "request"])
            response_count = len([r for r in result.network_requests if r.get("event_type") == "response"])
            print(f"   - {request_count} requests, {response_count} responses")

            # Check if we captured specific resources
            urls = [r.get("url") for r in result.network_requests]
            has_image = any("/image.png" in url for url in urls)
            has_api_data = any("/api/data" in url for url in urls)
            has_api_json = any("/api/json" in url for url in urls)

            assert has_image, "Should have captured image request"
            assert has_api_data, "Should have captured API data request"
            assert has_api_json, "Should have captured API JSON request"
            print("✓ Captured expected network requests (image, API endpoints)")

        # Test with console capture enabled
        print("\n3. Testing with console capture enabled...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                wait_until="networkidle",  # Wait for network to be idle
                capture_console_messages=True,
            )
            result = await crawler.arun(url="http://localhost:8080/", config=config)

            assert result.console_messages is not None, "Console messages should be captured"
            print(f"✓ Captured {len(result.console_messages)} console messages")

            # Check if we have different types of console messages
            message_types = set(msg.get("type") for msg in result.console_messages if "type" in msg)
            print(f"   - Message types: {', '.join(message_types)}")

            # Print all captured messages for debugging
            print("   - Captured messages:")
            for msg in result.console_messages:
                print(f"     * Type: {msg.get('type', 'N/A')}, Text: {msg.get('text', 'N/A')}")

            # Look for specific messages
            messages = [msg.get("text") for msg in result.console_messages if "text" in msg]
            has_basic_log = any("Basic console log" in msg for msg in messages)
            has_error_msg = any("Error message" in msg for msg in messages)
            has_warning_msg = any("Warning message" in msg for msg in messages)

            assert has_basic_log, "Should have captured basic console.log message"
            assert has_error_msg, "Should have captured console.error message"
            assert has_warning_msg, "Should have captured console.warn message"
            print("✓ Captured expected console messages (log, error, warning)")

        # Test with both captures enabled
        print("\n4. Testing with both network and console capture enabled...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                wait_until="networkidle",  # Wait for network to be idle
                capture_network_requests=True,
                capture_console_messages=True,
            )
            result = await crawler.arun(url="http://localhost:8080/", config=config)

            assert result.network_requests is not None, "Network requests should be captured"
            assert result.console_messages is not None, "Console messages should be captured"
            print(f"✓ Successfully captured both {len(result.network_requests)} network requests and {len(result.console_messages)} console messages")
    finally:
        await runner.cleanup()
        print("\nTest server shutdown")

async def main():
    try:
        await test_network_console_capture()
        print("\n✅ All tests passed!")
    except Exception as e:
        print(f"\n❌ Test failed: {e}")
        raise


if __name__ == "__main__":
    asyncio.run(main())