import asyncio
import time
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
# Alternative strategy (priority-ordered rather than breadth-first); a hedged
# sketch of it appears after main() below:
# from crawl4ai.deep_crawling import BestFirstCrawlingStrategy


async def main():
    """Example deep crawl of a documentation site in batch and streaming modes."""
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,             # follow links up to two hops from the start URL
            include_external=False,  # stay within the starting domain
        ),
        stream=False,                # batch mode first; flipped to True further down
        verbose=True,
        cache_mode=CacheMode.BYPASS,                  # always fetch fresh pages
        scraping_strategy=LXMLWebScrapingStrategy(),  # faster lxml-based scraper
    )
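
    # With stream=False, arun() returns the complete list of crawled pages once
    # the deep crawl finishes; with stream=True it yields results as they arrive.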
    async with AsyncWebCrawler() as crawler:
        start_time = time.perf_counter()
        print("\nStarting deep crawl in batch mode:")
        results = await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=config,
        )
        print(f"Crawled {len(results)} pages")
        if results:  # guard against an empty crawl before indexing
            print(f"Example page: {results[0].url}")
        print(f"Duration: {time.perf_counter() - start_time:.2f} seconds\n")
        print("Starting deep crawl in streaming mode:")
        config.stream = True
        start_time = time.perf_counter()
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=config,
        ):
            # Each result carries its crawl depth in the metadata dict.
            print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
        print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
if __name__ == "__main__":
    asyncio.run(main())