crawl4ai/docs/examples/deepcrawl.py

import asyncio
import time

from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import (
    FilterChain,
    URLPatternFilter,
    DomainFilter,
    ContentTypeFilter,
    ContentRelevanceFilter,
    SEOFilter,
)
from crawl4ai.deep_crawling.scorers import (
    KeywordRelevanceScorer,
)


# 1️⃣ Basic Deep Crawl Setup
async def basic_deep_crawl():
    """
    PART 1: Basic Deep Crawl setup - Demonstrates a simple two-level deep crawl.

    This function shows:
    - How to set up BFSDeepCrawlStrategy (Breadth-First Search)
    - Setting depth and domain parameters
    - Processing the results to show the hierarchy
    """
    print("\n===== BASIC DEEP CRAWL SETUP =====")

    # Configure a 2-level deep crawl using Breadth-First Search strategy
    # max_depth=2 means: initial page (depth 0) + 2 more levels
    # include_external=False means: only follow links within the same domain
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, include_external=False),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,  # Show progress during crawling
    )

    async with AsyncWebCrawler() as crawler:
        start_time = time.perf_counter()
        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        # Group results by depth to visualize the crawl tree
        pages_by_depth = {}
        for result in results:
            depth = result.metadata.get("depth", 0)
            if depth not in pages_by_depth:
                pages_by_depth[depth] = []
            pages_by_depth[depth].append(result.url)

        print(f"✅ Crawled {len(results)} pages total")

        # Display crawl structure by depth
        for depth, urls in sorted(pages_by_depth.items()):
            print(f"\nDepth {depth}: {len(urls)} pages")
            # Show first 3 URLs for each depth as examples
            for url in urls[:3]:
                print(f"  → {url}")
            if len(urls) > 3:
                print(f"  ... and {len(urls) - 3} more")

        print(
            f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
        )


# 2️⃣ Stream vs. Non-Stream Execution
async def stream_vs_nonstream():
    """
    PART 2: Demonstrates the difference between stream and non-stream execution.

    Non-stream: Waits for all results before processing
    Stream: Processes results as they become available
    """
    print("\n===== STREAM VS. NON-STREAM EXECUTION =====")

    # Common configuration for both examples
    base_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,
    )

    async with AsyncWebCrawler() as crawler:
        # NON-STREAMING MODE
        print("\n📊 NON-STREAMING MODE:")
        print("  In this mode, all results are collected before being returned.")

        non_stream_config = base_config.clone()
        non_stream_config.stream = False

        start_time = time.perf_counter()
        results = await crawler.arun(
            url="https://docs.crawl4ai.com", config=non_stream_config
        )

        print(f"  ✅ Received all {len(results)} results at once")
        print(f"  ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds")

        # STREAMING MODE
        print("\n📊 STREAMING MODE:")
        print("  In this mode, results are processed as they become available.")

        stream_config = base_config.clone()
        stream_config.stream = True

        start_time = time.perf_counter()
        result_count = 0
        first_result_time = None

        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=stream_config
        ):
            result_count += 1
            if result_count == 1:
                first_result_time = time.perf_counter() - start_time
                print(
                    f"  ✅ First result received after {first_result_time:.2f} seconds: {result.url}"
                )
            elif result_count % 5 == 0:  # Show every 5th result for brevity
                print(f"  → Result #{result_count}: {result.url}")

        print(f"  ✅ Total: {result_count} results")
        print(f"  ✅ First result: {first_result_time:.2f} seconds")
        print(f"  ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
        print("\n🔍 Key Takeaway: Streaming allows processing results immediately")


# 3️⃣ Introduce Filters & Scorers
async def filters_and_scorers():
    """
    PART 3: Demonstrates the use of filters and scorers for more targeted crawling.

    This function progressively adds:
    1. A single URL pattern filter
    2. Multiple filters in a chain
    3. Scorers for prioritizing pages
    """
    print("\n===== FILTERS AND SCORERS =====")

    async with AsyncWebCrawler() as crawler:
        # SINGLE FILTER EXAMPLE
        print("\n📊 EXAMPLE 1: SINGLE URL PATTERN FILTER")
        print("  Only crawl pages containing 'core' in the URL")

        # Create a filter that only allows URLs with 'guide' in them
        url_filter = URLPatternFilter(patterns=["*core*"])

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1,
                include_external=False,
                filter_chain=FilterChain([url_filter]),  # Single filter
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            cache_mode=CacheMode.BYPASS,
            verbose=True,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        print(f"  ✅ Crawled {len(results)} pages matching '*core*'")
        for result in results[:3]:  # Show first 3 results
            print(f"  → {result.url}")
        if len(results) > 3:
            print(f"  ... and {len(results) - 3} more")

        # MULTIPLE FILTERS EXAMPLE
        print("\n📊 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN")
        print("  Only crawl pages that:")
        print("  1. Contain '2024' in the URL")
        print("  2. Are from 'techcrunch.com'")
        print("  3. Are of text/html or application/javascript content type")

        # Create a chain of filters
        filter_chain = FilterChain(
            [
                URLPatternFilter(patterns=["*2024*"]),
                DomainFilter(
                    allowed_domains=["techcrunch.com"],
                    blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"],
                ),
                ContentTypeFilter(
                    allowed_types=["text/html", "application/javascript"]
                ),
            ]
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1, include_external=False, filter_chain=filter_chain
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
        )

        results = await crawler.arun(url="https://techcrunch.com", config=config)

        print(f"  ✅ Crawled {len(results)} pages after applying all filters")
        for result in results[:3]:
            print(f"  → {result.url}")
        if len(results) > 3:
            print(f"  ... and {len(results) - 3} more")

        # SCORERS EXAMPLE
        print("\n📊 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER")
        print(
            "Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'"
        )

        # Create a keyword relevance scorer
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=0.3
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(  # Note: Changed to BestFirst
                max_depth=1, include_external=False, url_scorer=keyword_scorer
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            cache_mode=CacheMode.BYPASS,
            verbose=True,
            stream=True,
        )

        results = []
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score")
            print(f"  → Score: {score:.2f} | {result.url}")

        print(f"  ✅ Crawler prioritized {len(results)} pages by relevance score")
        print("  🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")


# 4️⃣ Wrap-Up and Key Takeaways
async def wrap_up():
    """
    PART 4: Wrap-Up and Key Takeaways

    Summarize the key concepts learned in this tutorial.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Create a composite scorer that combines multiple scoring strategies
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )
    # Set up the configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl
    results = []
    start_time = time.perf_counter()

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")

    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    print(
        f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
    )

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1

    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")


# 5️⃣ Advanced Filters
async def advanced_filters():
    """
    PART 5: Demonstrates advanced filtering techniques for specialized crawling.

    This function covers:
    - SEO filters
    - Text relevancy filtering
    - Combining advanced filters
    """
    print("\n===== ADVANCED FILTERS =====")

    async with AsyncWebCrawler() as crawler:
        # SEO FILTER EXAMPLE
        print("\n📊 EXAMPLE 1: SEO FILTERS")
        print(
            "Quantitative SEO quality assessment filter based searching keywords in the head section"
        )

        seo_filter = SEOFilter(
            threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1, filter_chain=FilterChain([seo_filter])
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        print(f"  ✅ Found {len(results)} pages with relevant keywords")
        for result in results:
            print(f"  → {result.url}")

        # ADVANCED TEXT RELEVANCY FILTER
        print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")

        # More sophisticated content relevance filter
        relevance_filter = ContentRelevanceFilter(
            query="Interact with the web using your authentic digital identity",
            threshold=0.7,
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1, filter_chain=FilterChain([relevance_filter])
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        print(f"  ✅ Found {len(results)} pages")
        for result in results:
            relevance_score = result.metadata.get("relevance_score", 0)
            print(f"  → Score: {relevance_score:.2f} | {result.url}")


# Main function to run the entire tutorial
async def run_tutorial():
    """
    Executes all tutorial sections in sequence.
    """
    print("\n🚀 CRAWL4AI DEEP CRAWLING TUTORIAL 🚀")
    print("======================================")
    print("This tutorial will walk you through deep crawling techniques,")
    print("from basic to advanced, using the Crawl4AI library.")

    # Define sections - uncomment to run specific parts during development
    tutorial_sections = [
        basic_deep_crawl,
        stream_vs_nonstream,
        filters_and_scorers,
        wrap_up,
        advanced_filters,
    ]

    for section in tutorial_sections:
        await section()

    print("\n🎉 TUTORIAL COMPLETE! 🎉")
    print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
    print("For more information, check out https://docs.crawl4ai.com")


# Execute the tutorial when run directly
if __name__ == "__main__":
    asyncio.run(run_tutorial())