crawl4ai/tests/general/test_deep_crawl.py
unclecode 66ac07b4f3 feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality:
- Add capture_network_requests and capture_console_messages config parameters
- Add network_requests and console_messages fields to models
- Implement Playwright event listeners to capture requests, responses, and console output
- Create detailed documentation and examples
- Add comprehensive tests

This feature enables deep visibility into web page activity for debugging,
security analysis, performance profiling, and API discovery in web applications.
2025-04-10 16:03:48 +08:00

46 lines
1.5 KiB
Python

import asyncio
import time
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
async def main():
"""Example deep crawl of documentation site."""
config = CrawlerRunConfig(
deep_crawl_strategy = BFSDeepCrawlStrategy(
max_depth=2,
include_external=False
),
stream=False,
verbose=True,
cache_mode=CacheMode.BYPASS,
scraping_strategy=LXMLWebScrapingStrategy()
)
async with AsyncWebCrawler() as crawler:
start_time = time.perf_counter()
print("\nStarting deep crawl in batch mode:")
results = await crawler.arun(
url="https://docs.crawl4ai.com",
config=config
)
print(f"Crawled {len(results)} pages")
print(f"Example page: {results[0].url}")
print(f"Duration: {time.perf_counter() - start_time:.2f} seconds\n")
print("Starting deep crawl in streaming mode:")
config.stream = True
start_time = time.perf_counter()
async for result in await crawler.arun(
url="https://docs.crawl4ai.com",
config=config
):
print(f"{result.url} (Depth: {result.metadata.get('depth', 0)})")
print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
if __name__ == "__main__":
asyncio.run(main())