"""Tests for network request and console message capturing.

Exercises the capture_network_requests and capture_console_messages
CrawlerRunConfig parameters: Playwright event listeners record requests,
responses, and console output, enabling deep visibility into web page
activity for debugging, security analysis, performance profiling, and
API discovery.
"""
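
# NOTE: running this script directly drives a real browser through crawl4ai,
# so a Playwright browser must be installed (e.g. `playwright install chromium`).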

import asyncio

from aiohttp import web

from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_webcrawler import AsyncWebCrawler


async def start_test_server():
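    """Start a local aiohttp test server on http://localhost:8080.

    The root page loads an image and fires XHR and fetch requests while
    writing log, warning, and error messages to the console. Returns the
    AppRunner so the caller can shut the server down. Assumes port 8080
    is free on the machine running the tests.
    """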
    app = web.Application()

    async def basic_page(request):
        return web.Response(text="""
            <!DOCTYPE html>
            <html>
            <head>
                <title>Network Request Test</title>
            </head>
            <body>
                <h1>Test Page for Network Capture</h1>
                <p>This page performs network requests and console logging.</p>
                <img src="/image.png" alt="Test Image">
                <script>
                    console.log("Basic console log");
                    console.error("Error message");
                    console.warn("Warning message");

                    // Make an XHR request
                    const xhr = new XMLHttpRequest();
                    xhr.open('GET', '/api/data', true);
                    xhr.send();

                    // Make a fetch request
                    fetch('/api/json')
                        .then(response => response.json())
                        .catch(error => console.error('Fetch error:', error));

                    // Trigger an error and catch it
                    setTimeout(() => {
                        try {
                            nonExistentFunction();
                        } catch (e) {
                            console.error("Caught error:", e);
                        }
                    }, 100);
                </script>
            </body>
            </html>
        """, content_type="text/html")

    async def image(request):
        # Return a small 1x1 transparent PNG
        return web.Response(
            body=bytes.fromhex(
                '89504E470D0A1A0A0000000D49484452000000010000000108060000'
                '001F15C4890000000D4944415478DA63FAFFFF3F030079DB00018D95'
                '9DE70000000049454E44AE426082'
            ),
            content_type="image/png",
        )

    async def api_data(request):
        return web.Response(text="sample data")

    async def api_json(request):
        return web.json_response({"status": "success", "message": "JSON data"})

    # Register routes
    app.router.add_get('/', basic_page)
    app.router.add_get('/image.png', image)
    app.router.add_get('/api/data', api_data)
    app.router.add_get('/api/json', api_json)

    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, 'localhost', 8080)
    await site.start()

    return runner


async def test_network_console_capture():
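    """Exercise the four capture configurations against the local test server.

    Covers: both captures disabled (the default), network capture only,
    console capture only, and both enabled. The assertions assume each
    captured network entry is a dict exposing "event_type" and "url" keys,
    and each console entry a dict exposing "type" and "text" keys.
    """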
print("\n=== Testing Network and Console Capture ===\n")
|
|
|
|
# Start test server
|
|
runner = await start_test_server()
|
|
try:
|
|
        browser_config = BrowserConfig(headless=True)

        # Test with capture disabled (default)
        print("\n1. Testing with capture disabled (default)...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                wait_until="networkidle",  # Wait for network to be idle
            )
            result = await crawler.arun(url="http://localhost:8080/", config=config)

            assert result.network_requests is None, "Network requests should be None when capture is disabled"
            assert result.console_messages is None, "Console messages should be None when capture is disabled"
            print("✓ Default config correctly returns None for network_requests and console_messages")

        # Test with network capture enabled
        print("\n2. Testing with network capture enabled...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                wait_until="networkidle",  # Wait for network to be idle
                capture_network_requests=True,
            )
            result = await crawler.arun(url="http://localhost:8080/", config=config)

            assert result.network_requests is not None, "Network requests should be captured"
            print(f"✓ Captured {len(result.network_requests)} network requests")

            # Check if we have both requests and responses
            request_count = len([r for r in result.network_requests if r.get("event_type") == "request"])
            response_count = len([r for r in result.network_requests if r.get("event_type") == "response"])
            print(f"  - {request_count} requests, {response_count} responses")

            # Check if we captured specific resources
            # ("or ''" guards against entries that lack a url)
            urls = [r.get("url") or "" for r in result.network_requests]
            has_image = any("/image.png" in url for url in urls)
            has_api_data = any("/api/data" in url for url in urls)
            has_api_json = any("/api/json" in url for url in urls)

            assert has_image, "Should have captured image request"
            assert has_api_data, "Should have captured API data request"
            assert has_api_json, "Should have captured API JSON request"
            print("✓ Captured expected network requests (image, API endpoints)")

        # Test with console capture enabled
        print("\n3. Testing with console capture enabled...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                wait_until="networkidle",  # Wait for network to be idle
                capture_console_messages=True,
            )
            result = await crawler.arun(url="http://localhost:8080/", config=config)

            assert result.console_messages is not None, "Console messages should be captured"
            print(f"✓ Captured {len(result.console_messages)} console messages")

            # Check if we have different types of console messages
            message_types = set(msg.get("type") for msg in result.console_messages if "type" in msg)
            print(f"  - Message types: {', '.join(message_types)}")

            # Print all captured messages for debugging
            print("  - Captured messages:")
            for msg in result.console_messages:
                print(f"    * Type: {msg.get('type', 'N/A')}, Text: {msg.get('text', 'N/A')}")

            # Look for specific messages
            messages = [msg.get("text") for msg in result.console_messages if "text" in msg]
            has_basic_log = any("Basic console log" in msg for msg in messages)
            has_error_msg = any("Error message" in msg for msg in messages)
            has_warning_msg = any("Warning message" in msg for msg in messages)

            assert has_basic_log, "Should have captured basic console.log message"
            assert has_error_msg, "Should have captured console.error message"
            assert has_warning_msg, "Should have captured console.warn message"
            print("✓ Captured expected console messages (log, error, warning)")

        # Test with both captures enabled
        print("\n4. Testing with both network and console capture enabled...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                wait_until="networkidle",  # Wait for network to be idle
                capture_network_requests=True,
                capture_console_messages=True,
            )
            result = await crawler.arun(url="http://localhost:8080/", config=config)

            assert result.network_requests is not None, "Network requests should be captured"
            assert result.console_messages is not None, "Console messages should be captured"
            print(f"✓ Successfully captured both {len(result.network_requests)} network requests and {len(result.console_messages)} console messages")

    finally:
        await runner.cleanup()
        print("\nTest server shutdown")


async def main():
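    """Run the capture tests and report overall success or failure."""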
    try:
        await test_network_console_capture()
        print("\n✅ All tests passed successfully!")
    except Exception as e:
        print(f"\n❌ Test failed: {str(e)}")
        raise


if __name__ == "__main__":
    asyncio.run(main())