crawl4ai/tests/general/test_stream.py
unclecode 66ac07b4f3 feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality:
- Add capture_network_requests and capture_console_messages config parameters
- Add network_requests and console_messages fields to models
- Implement Playwright event listeners to capture requests, responses, and console output (see the sketch after this list)
- Create detailed documentation and examples
- Add comprehensive tests
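
As a rough illustration of what that listener wiring can look like, here is a minimal standalone Playwright sketch; the dict shapes and variable names are illustrative assumptions, not crawl4ai's internal format:

import asyncio
from playwright.async_api import async_playwright

async def capture_page_activity(url: str):
    # Illustrative buffers; crawl4ai's internal storage may differ
    network_requests, console_messages = [], []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        # Record outgoing requests and their responses
        page.on("request", lambda req: network_requests.append(
            {"event": "request", "url": req.url, "method": req.method}))
        page.on("response", lambda res: network_requests.append(
            {"event": "response", "url": res.url, "status": res.status}))
        # Record console output emitted by the page
        page.on("console", lambda msg: console_messages.append(
            {"type": msg.type, "text": msg.text}))
        await page.goto(url)
        await browser.close()
    return network_requests, console_messages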

This feature enables deep visibility into web page activity for debugging,
security analysis, performance profiling, and API discovery in web applications.
2025-04-10 16:03:48 +08:00
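
Below is a minimal usage sketch of the new flags; it assumes, per the bullets above, that the captured data is exposed as network_requests and console_messages on the crawl result:

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

async def capture_demo():
    # Enable the capture flags introduced in this commit
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        capture_network_requests=True,
        capture_console_messages=True,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(url="http://example.com", config=config)
        # network_requests / console_messages are the model fields named above;
        # treating them as optional lists is an assumption here
        print(f"Network events captured: {len(result.network_requests or [])}")
        print(f"Console messages captured: {len(result.console_messages or [])}")

if __name__ == "__main__":
    asyncio.run(capture_demo())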


import os, sys
# append 2 parent directories to sys.path to import crawl4ai
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)
import asyncio
from crawl4ai import *

async def test_crawler():
    # Setup configurations
    browser_config = BrowserConfig(headless=True, verbose=False)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
    )
    # Test URLs - mix of different sites
    urls = [
        "http://example.com",
        "http://example.org",
        "http://example.net",
    ] * 10  # 30 total URLs

    async with AsyncWebCrawler(config=browser_config) as crawler:
        print("\n=== Testing Streaming Mode ===")
        # stream=True makes arun_many yield results as each URL completes
        async for result in await crawler.arun_many(
            urls=urls,
            config=crawler_config.clone(stream=True),
        ):
            print(f"Received result for: {result.url} - Success: {result.success}")

        print("\n=== Testing Batch Mode ===")
        # Without stream=True, arun_many collects everything and returns a list
        results = await crawler.arun_many(
            urls=urls,
            config=crawler_config,
        )
        print(f"Received all {len(results)} results at once")
        for result in results:
            print(f"Batch result for: {result.url} - Success: {result.success}")


if __name__ == "__main__":
    asyncio.run(test_crawler())