# crawl4ai/docs/examples/deepcrawl_example.py

import asyncio
import time

from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import (
    FilterChain,
    URLPatternFilter,
    DomainFilter,
    ContentTypeFilter,
    ContentRelevanceFilter,
    SEOFilter,
)
from crawl4ai.deep_crawling.scorers import (
    KeywordRelevanceScorer,
)


# 1️⃣ Basic Deep Crawl Setup
async def basic_deep_crawl():
    """
    PART 1: Basic deep crawl - a breadth-first crawl limited to two levels.

    Walks through:
    - Building a BFSDeepCrawlStrategy (Breadth-First Search)
    - Choosing the depth limit and the external-link setting
    - Bucketing the results by depth to display the crawl hierarchy
    """
    print("\n===== BASIC DEEP CRAWL SETUP =====")

    # max_depth=2 means: the start page (depth 0) plus two more levels.
    # include_external=False means: only follow links within the same domain.
    bfs_strategy = BFSDeepCrawlStrategy(max_depth=2, include_external=False)
    config = CrawlerRunConfig(
        deep_crawl_strategy=bfs_strategy,
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,  # Report progress while crawling
    )

    async with AsyncWebCrawler() as crawler:
        started = time.perf_counter()
        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        # Bucket the crawled URLs by the depth recorded in each result's metadata
        pages_by_depth = {}
        for page in results:
            level = page.metadata.get("depth", 0)
            pages_by_depth.setdefault(level, []).append(page.url)

        total = len(results)
        print(f"✅ Crawled {total} pages total")

        # Print the buckets from the shallowest depth to the deepest
        for level in sorted(pages_by_depth):
            level_urls = pages_by_depth[level]
            print(f"\nDepth {level}: {len(level_urls)} pages")
            # Only the first 3 URLs of each depth are listed as a sample
            for url in level_urls[:3]:
                print(f"  → {url}")
            hidden = len(level_urls) - 3
            if hidden > 0:
                print(f"  ... and {hidden} more")

        elapsed = time.perf_counter() - started
        print(f"\n✅ Performance: {total} pages in {elapsed:.2f} seconds")

# 2️⃣ Stream vs. Non-Stream Execution
async def stream_vs_nonstream():
    """
    PART 2: Demonstrates the difference between stream and non-stream execution.

    Non-stream: Waits for all results before processing
    Stream: Processes results as they become available

    Both runs crawl the same start URL with clones of one shared base
    configuration, so the only difference between them is the ``stream`` flag.
    """
    print("\n===== STREAM VS. NON-STREAM EXECUTION =====")

    # Common configuration for both examples
    base_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=False,
    )

    async with AsyncWebCrawler() as crawler:
        # NON-STREAMING MODE
        print("\n📊 NON-STREAMING MODE:")
        print("  In this mode, all results are collected before being returned.")

        # Clone so the shared base configuration is never mutated
        non_stream_config = base_config.clone()
        non_stream_config.stream = False

        start_time = time.perf_counter()
        results = await crawler.arun(
            url="https://docs.crawl4ai.com", config=non_stream_config
        )

        print(f"  ✅ Received all {len(results)} results at once")
        print(f"  ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds")

        # STREAMING MODE
        print("\n📊 STREAMING MODE:")
        print("  In this mode, results are processed as they become available.")

        stream_config = base_config.clone()
        stream_config.stream = True

        start_time = time.perf_counter()
        result_count = 0
        # Stays None when the stream yields nothing at all
        first_result_time = None

        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=stream_config
        ):
            result_count += 1
            if result_count == 1:
                first_result_time = time.perf_counter() - start_time
                print(
                    f"  ✅ First result received after {first_result_time:.2f} seconds: {result.url}"
                )
            elif result_count % 5 == 0:  # Show every 5th result for brevity
                print(f"  → Result #{result_count}: {result.url}")

        print(f"  ✅ Total: {result_count} results")
        # Formatting None with ':.2f' raises TypeError, so only report the
        # first-result latency when at least one result actually arrived.
        if first_result_time is not None:
            print(f"  ✅ First result: {first_result_time:.2f} seconds")
        else:
            print("  ⚠️ No results were received from the stream")
        print(f"  ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
        print("\n🔍 Key Takeaway: Streaming allows processing results immediately")

# 3️⃣ Introduce Filters & Scorers
async def filters_and_scorers():
    """
    PART 3: Demonstrates the use of filters and scorers for more targeted crawling.

    This function progressively adds:
    1. A single URL pattern filter
    2. Multiple filters in a chain
    3. Scorers for prioritizing pages

    All three examples share a single AsyncWebCrawler instance.
    """
    print("\n===== FILTERS AND SCORERS =====")

    async with AsyncWebCrawler() as crawler:
        # SINGLE FILTER EXAMPLE
        print("\n📊 EXAMPLE 1: SINGLE URL PATTERN FILTER")
        print("  Only crawl pages containing 'core' in the URL")

        # Create a filter that only allows URLs with 'core' in them
        url_filter = URLPatternFilter(patterns=["*core*"])

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1,
                include_external=False,
                filter_chain=FilterChain([url_filter]),  # Single filter
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            cache_mode=CacheMode.BYPASS,
            verbose=True,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        print(f"  ✅ Crawled {len(results)} pages matching '*core*'")
        for result in results[:3]:  # Show first 3 results
            print(f"  → {result.url}")
        if len(results) > 3:
            print(f"  ... and {len(results) - 3} more")

        # MULTIPLE FILTERS EXAMPLE
        print("\n📊 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN")
        print("  Only crawl pages that:")
        print("  1. Contain '2024' in the URL")
        print("  2. Are from 'techcrunch.com'")
        print("  3. Are of text/html or application/javascript content type")

        # Create a chain of filters
        filter_chain = FilterChain(
            [
                URLPatternFilter(patterns=["*2024*"]),
                DomainFilter(
                    allowed_domains=["techcrunch.com"],
                    blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"],
                ),
                ContentTypeFilter(
                    allowed_types=["text/html", "application/javascript"]
                ),
            ]
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1, include_external=False, filter_chain=filter_chain
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
        )

        results = await crawler.arun(url="https://techcrunch.com", config=config)

        print(f"  ✅ Crawled {len(results)} pages after applying all filters")
        for result in results[:3]:
            print(f"  → {result.url}")
        if len(results) > 3:
            print(f"  ... and {len(results) - 3} more")

        # SCORERS EXAMPLE
        print("\n📊 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER")
        print(
            "Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'"
        )

        # Create a keyword relevance scorer
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(
                max_depth=1, include_external=False, url_scorer=keyword_scorer
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            cache_mode=CacheMode.BYPASS,
            verbose=True,
            stream=True,
        )

        # stream=True: results are consumed one at a time as they arrive
        results = []
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            # Fall back to 0 when the score is missing or None; formatting
            # None with ':.2f' would raise TypeError and abort the tutorial.
            score = result.metadata.get("score") or 0
            print(f"  → Score: {score:.2f} | {result.url}")

        print(f"  ✅ Crawler prioritized {len(results)} pages by relevance score")
        print("  🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")

# 4️⃣ Advanced Filters
async def advanced_filters():
    """
    PART 4: Advanced filtering techniques for specialized crawling.

    Two depth-1 BFS crawls are run from the same start URL, each with a
    single-filter chain:
    - an SEOFilter configured with a keyword list and a threshold
    - a ContentRelevanceFilter configured with a free-text query and a threshold
    """
    print("\n===== ADVANCED FILTERS =====")

    start_url = "https://docs.crawl4ai.com"

    async with AsyncWebCrawler() as crawler:
        # ---- Example 1: SEO filter ----
        print("\n📊 EXAMPLE 1: SEO FILTERS")
        print(
            "Quantitative SEO quality assessment filter based searching keywords in the head section"
        )

        seo_filter = SEOFilter(
            threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
        )
        seo_strategy = BFSDeepCrawlStrategy(
            max_depth=1, filter_chain=FilterChain([seo_filter])
        )
        seo_config = CrawlerRunConfig(
            deep_crawl_strategy=seo_strategy,
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        seo_results = await crawler.arun(url=start_url, config=seo_config)

        print(f"  ✅ Found {len(seo_results)} pages with relevant keywords")
        for page in seo_results:
            print(f"  → {page.url}")

        # ---- Example 2: text relevancy filter ----
        print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")

        relevance_filter = ContentRelevanceFilter(
            query="Interact with the web using your authentic digital identity",
            threshold=0.7,
        )
        relevance_strategy = BFSDeepCrawlStrategy(
            max_depth=1, filter_chain=FilterChain([relevance_filter])
        )
        relevance_config = CrawlerRunConfig(
            deep_crawl_strategy=relevance_strategy,
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        relevance_results = await crawler.arun(url=start_url, config=relevance_config)

        print(f"  ✅ Found {len(relevance_results)} pages")
        for page in relevance_results:
            # Pages without a recorded relevance score are displayed as 0.00
            page_score = page.metadata.get("relevance_score", 0)
            print(f"  → Score: {page_score:.2f} | {page.url}")

# 5️⃣ Max Pages and Score Thresholds
async def max_pages_and_thresholds():
    """
    PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.

    This function shows:
    - How to limit the number of pages crawled
    - How to set score thresholds for more targeted crawling
    - Comparing BFS, DFS, and Best-First strategies with these parameters

    The printed descriptions are derived from the values actually passed to
    each strategy, so the console output cannot drift from the configuration.
    """
    print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")

    from crawl4ai.deep_crawling import DFSDeepCrawlStrategy

    async with AsyncWebCrawler() as crawler:
        # Define a common keyword scorer for all examples
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["browser", "crawler", "web", "automation"],
            weight=1.0
        )

        # EXAMPLE 1: BFS WITH MAX PAGES
        print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
        print("  Limit the crawler to a maximum of 5 pages")

        bfs_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=5  # Only crawl 5 pages
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)

        print(f"  ✅ Crawled exactly {len(results)} pages as specified by max_pages")
        for result in results:
            depth = result.metadata.get("depth", 0)
            print(f"  → Depth: {depth} | {result.url}")

        # EXAMPLE 2: DFS WITH SCORE THRESHOLD
        # Single source of truth for the threshold: it is both printed and
        # passed to the strategy (the message used to say 0.5 while the
        # strategy was configured with 0.7).
        dfs_score_threshold = 0.7

        print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
        print(f"  Only crawl pages with a relevance score above {dfs_score_threshold}")

        dfs_config = CrawlerRunConfig(
            deep_crawl_strategy=DFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                score_threshold=dfs_score_threshold,  # Score cut-off announced above
                max_pages=10
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)

        print(f"  ✅ Crawled {len(results)} pages with scores above threshold")
        for result in results:
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"  → Depth: {depth} | Score: {score:.2f} | {result.url}")

        # EXAMPLE 3: BEST-FIRST WITH MAX PAGES
        # No score_threshold is passed to the Best-First strategy below, so
        # the output only advertises the page limit that is really applied.
        print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH MAX PAGES LIMIT")
        print("  Limit to 7 pages, prioritizing highest scores")

        bf_config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=7,          # Limit to 7 pages total
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
            stream=True,
        )

        # stream=True: results are consumed one at a time as they arrive
        results = []
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"  → Depth: {depth} | Score: {score:.2f} | {result.url}")

        print(f"  ✅ Crawled {len(results)} high-value pages")
        # Guard against an empty crawl before dividing by the result count
        if results:
            avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
            print(f"  ✅ Average score: {avg_score:.2f}")
            print("  🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")

# 6️⃣ Wrap-Up and Key Takeaways
async def wrap_up():
    """
    PART 6: Wrap-Up and Key Takeaways

    Combines a multi-filter chain, a keyword scorer and streaming in one
    Best-First crawl, then prints a summary: page count, duration, average
    score (only when at least one page was crawled) and pages per depth.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Create the keyword relevance scorer handed to the Best-First strategy
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )
    # Set up the configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl
    results = []
    start_time = time.perf_counter()

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")

    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    # An empty crawl would make the average a division by zero, so the
    # average is only reported when there is something to average.
    if results:
        avg_score = sum(r.metadata.get("score", 0) for r in results) / len(results)
        print(f"✅ Average score: {avg_score:.2f}")

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1

    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")


async def run_tutorial():
    """
    Run every tutorial section one after another, framed by an intro banner
    and a closing message.
    """
    intro_lines = (
        "\n🚀 CRAWL4AI DEEP CRAWLING TUTORIAL 🚀",
        "======================================",
        "This tutorial will walk you through deep crawling techniques,",
        "from basic to advanced, using the Crawl4AI library.",
    )
    for line in intro_lines:
        print(line)

    # Sections run in this order; trim the tuple to run only specific parts
    # during development.
    tutorial_sections = (
        basic_deep_crawl,
        stream_vs_nonstream,
        filters_and_scorers,
        max_pages_and_thresholds,
        advanced_filters,
        wrap_up,
    )

    # Each section is awaited to completion before the next one starts
    for run_section in tutorial_sections:
        await run_section()

    outro_lines = (
        "\n🎉 TUTORIAL COMPLETE! 🎉",
        "You now have a comprehensive understanding of deep crawling with Crawl4AI.",
        "For more information, check out https://docs.crawl4ai.com",
    )
    for line in outro_lines:
        print(line)

# Script entry point: when this file is executed directly (rather than
# imported), drive the run_tutorial() coroutine to completion via asyncio.run().
if __name__ == "__main__":
    asyncio.run(run_tutorial())
-												Release prep (#749)

* fix: Update export of URLPatternFilter

* chore: Add dependancy for cchardet in requirements

* docs: Update example for deep crawl in release note for v0.5

* Docs: update the example for memory dispatcher

* docs: updated example for crawl strategies

* Refactor: Removed wrapping in if __name__==main block since this is a markdown file.

* chore: removed cchardet from dependancy list, since unclecode is planning to remove it

* docs: updated the example for proxy rotation to a working example

* feat: Introduced ProxyConfig param

* Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1

* chore: update and test new dependancies

* feat:Make PyPDF2 a conditional dependancy

* updated tutorial and release note for v0.5

* docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename

* refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult

* fix: Bug in serialisation of markdown in acache_url

* Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown

* fix: remove deprecated markdown_v2 from docker

* Refactor: remove deprecated fit_markdown and fit_html from result

* refactor: fix cache retrieval for markdown as a string

* chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
											
										
										
											2025-02-28 17:23:35 +05:30
+								import asyncio
 								import time
 								from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
 								from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
 								from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
 								from crawl4ai.deep_crawling.filters import (
 								    FilterChain,
 								    URLPatternFilter,
 								    DomainFilter,
 								    ContentTypeFilter,
 								    ContentRelevanceFilter,
 								    SEOFilter,
 								)
 								from crawl4ai.deep_crawling.scorers import (
 								    KeywordRelevanceScorer,
 								)
 								# 1️⃣ Basic Deep Crawl Setup
 								async def basic_deep_crawl():
 								    """
 								    PART 1: Basic Deep Crawl setup - Demonstrates a simple two-level deep crawl.
 								    This function shows:
 								    - How to set up BFSDeepCrawlStrategy (Breadth-First Search)
 								    - Setting depth and domain parameters
 								    - Processing the results to show the hierarchy
 								    """
 								    print("\n===== BASIC DEEP CRAWL SETUP =====")
 								    # Configure a 2-level deep crawl using Breadth-First Search strategy
 								    # max_depth=2 means: initial page (depth 0) + 2 more levels
 								    # include_external=False means: only follow links within the same domain
 								    config = CrawlerRunConfig(
 								        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, include_external=False),
 								        scraping_strategy=LXMLWebScrapingStrategy(),
 								        verbose=True,  # Show progress during crawling
 								    )
 								    async with AsyncWebCrawler() as crawler:
 								        start_time = time.perf_counter()
 								        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
 								        # Group results by depth to visualize the crawl tree
 								        pages_by_depth = {}
 								        for result in results:
 								            depth = result.metadata.get("depth", 0)
 								            if depth not in pages_by_depth:
 								                pages_by_depth[depth] = []
 								            pages_by_depth[depth].append(result.url)
 								        print(f"✅ Crawled {len(results)} pages total")
 								        # Display crawl structure by depth
 								        for depth, urls in sorted(pages_by_depth.items()):
 								            print(f"\nDepth {depth}: {len(urls)} pages")
 								            # Show first 3 URLs for each depth as examples
 								            for url in urls[:3]:
 								                print(f"  → {url}")
 								            if len(urls) > 3:
 								                print(f"  ... and {len(urls) - 3} more")
 								        print(
 								            f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
 								        )
 								# 2️⃣ Stream vs. Non-Stream Execution
 								async def stream_vs_nonstream():
 								    """
 								    PART 2: Demonstrates the difference between stream and non-stream execution.
 								    Non-stream: Waits for all results before processing
 								    Stream: Processes results as they become available
 								    """
 								    print("\n===== STREAM VS. NON-STREAM EXECUTION =====")
 								    # Common configuration for both examples
 								    base_config = CrawlerRunConfig(
 								        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
 								        scraping_strategy=LXMLWebScrapingStrategy(),
-												refactor(deep-crawl): add max_pages limit and improve crawl control

Add max_pages parameter to all deep crawling strategies to limit total pages crawled.
Add score_threshold parameter to BFS/DFS strategies for quality control.
Remove legacy parameter handling in AsyncWebCrawler.
Improve error handling and logging in crawl strategies.

BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.run_many()

											
										
										
											2025-03-03 21:51:11 +08:00
+								        verbose=False,
-												Release prep (#749)

* fix: Update export of URLPatternFilter

* chore: Add dependancy for cchardet in requirements

* docs: Update example for deep crawl in release note for v0.5

* Docs: update the example for memory dispatcher

* docs: updated example for crawl strategies

* Refactor: Removed wrapping in if __name__==main block since this is a markdown file.

* chore: removed cchardet from dependancy list, since unclecode is planning to remove it

* docs: updated the example for proxy rotation to a working example

* feat: Introduced ProxyConfig param

* Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1

* chore: update and test new dependancies

* feat:Make PyPDF2 a conditional dependancy

* updated tutorial and release note for v0.5

* docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename

* refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult

* fix: Bug in serialisation of markdown in acache_url

* Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown

* fix: remove deprecated markdown_v2 from docker

* Refactor: remove deprecated fit_markdown and fit_html from result

* refactor: fix cache retrieval for markdown as a string

* chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
											
										
										
											2025-02-28 17:23:35 +05:30
+								    )
 								    async with AsyncWebCrawler() as crawler:
 								        # NON-STREAMING MODE
 								        print("\n📊 NON-STREAMING MODE:")
 								        print("  In this mode, all results are collected before being returned.")
 								        non_stream_config = base_config.clone()
 								        non_stream_config.stream = False
 								        start_time = time.perf_counter()
 								        results = await crawler.arun(
 								            url="https://docs.crawl4ai.com", config=non_stream_config
 								        )
 								        print(f"  ✅ Received all {len(results)} results at once")
 								        print(f"  ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds")
 								        # STREAMING MODE
 								        print("\n📊 STREAMING MODE:")
 								        print("  In this mode, results are processed as they become available.")
 								        stream_config = base_config.clone()
 								        stream_config.stream = True
 								        start_time = time.perf_counter()
 								        result_count = 0
 								        first_result_time = None
 								        async for result in await crawler.arun(
 								            url="https://docs.crawl4ai.com", config=stream_config
 								        ):
 								            result_count += 1
 								            if result_count == 1:
 								                first_result_time = time.perf_counter() - start_time
 								                print(
 								                    f"  ✅ First result received after {first_result_time:.2f} seconds: {result.url}"
 								                )
 								            elif result_count % 5 == 0:  # Show every 5th result for brevity
 								                print(f"  → Result #{result_count}: {result.url}")
 								        print(f"  ✅ Total: {result_count} results")
 								        print(f"  ✅ First result: {first_result_time:.2f} seconds")
 								        print(f"  ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
 								        print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
 								# 3️⃣ Introduce Filters & Scorers
 								async def filters_and_scorers():
 								    """
 								    PART 3: Demonstrates the use of filters and scorers for more targeted crawling.
 								    This function progressively adds:
 . A single URL pattern filter
 . Multiple filters in a chain
 . Scorers for prioritizing pages
 								    """
 								    print("\n===== FILTERS AND SCORERS =====")
 								    async with AsyncWebCrawler() as crawler:
 								        # SINGLE FILTER EXAMPLE
 								        print("\n📊 EXAMPLE 1: SINGLE URL PATTERN FILTER")
 								        print("  Only crawl pages containing 'core' in the URL")
 								        # Create a filter that only allows URLs with 'guide' in them
 								        url_filter = URLPatternFilter(patterns=["*core*"])
 								        config = CrawlerRunConfig(
 								            deep_crawl_strategy=BFSDeepCrawlStrategy(
 								                max_depth=1,
 								                include_external=False,
 								                filter_chain=FilterChain([url_filter]),  # Single filter
 								            ),
 								            scraping_strategy=LXMLWebScrapingStrategy(),
 								            cache_mode=CacheMode.BYPASS,
 								            verbose=True,
 								        )
 								        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
 								        print(f"  ✅ Crawled {len(results)} pages matching '*core*'")
 								        for result in results[:3]:  # Show first 3 results
 								            print(f"  → {result.url}")
 								        if len(results) > 3:
 								            print(f"  ... and {len(results) - 3} more")
 								        # MULTIPLE FILTERS EXAMPLE
 								        print("\n📊 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN")
 								        print("  Only crawl pages that:")
 								        print("  1. Contain '2024' in the URL")
 								        print("  2. Are from 'techcrunch.com'")
 								        print("  3. Are of text/html or application/javascript content type")
 								        # Create a chain of filters
 								        filter_chain = FilterChain(
 								            [
 								                URLPatternFilter(patterns=["*2024*"]),
 								                DomainFilter(
 								                    allowed_domains=["techcrunch.com"],
 								                    blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"],
 								                ),
 								                ContentTypeFilter(
 								                    allowed_types=["text/html", "application/javascript"]
 								                ),
 								            ]
 								        )
 								        config = CrawlerRunConfig(
 								            deep_crawl_strategy=BFSDeepCrawlStrategy(
 								                max_depth=1, include_external=False, filter_chain=filter_chain
 								            ),
 								            scraping_strategy=LXMLWebScrapingStrategy(),
 								            verbose=True,
 								        )
 								        results = await crawler.arun(url="https://techcrunch.com", config=config)
 								        print(f"  ✅ Crawled {len(results)} pages after applying all filters")
 								        for result in results[:3]:
 								            print(f"  → {result.url}")
 								        if len(results) > 3:
 								            print(f"  ... and {len(results) - 3} more")
 								        # SCORERS EXAMPLE
 								        print("\n📊 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER")
 								        print(
 								            "Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'"
 								        )
 								        # Create a keyword relevance scorer
 								        keyword_scorer = KeywordRelevanceScorer(
-												refactor(deep-crawl): add max_pages limit and improve crawl control

Add max_pages parameter to all deep crawling strategies to limit total pages crawled.
Add score_threshold parameter to BFS/DFS strategies for quality control.
Remove legacy parameter handling in AsyncWebCrawler.
Improve error handling and logging in crawl strategies.

BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.run_many()

											
										
										
											2025-03-03 21:51:11 +08:00
+								            keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1
-												Release prep (#749)

* fix: Update export of URLPatternFilter

* chore: Add dependancy for cchardet in requirements

* docs: Update example for deep crawl in release note for v0.5

* Docs: update the example for memory dispatcher

* docs: updated example for crawl strategies

* Refactor: Removed wrapping in if __name__==main block since this is a markdown file.

* chore: removed cchardet from dependancy list, since unclecode is planning to remove it

* docs: updated the example for proxy rotation to a working example

* feat: Introduced ProxyConfig param

* Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1

* chore: update and test new dependancies

* feat:Make PyPDF2 a conditional dependancy

* updated tutorial and release note for v0.5

* docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename

* refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult

* fix: Bug in serialisation of markdown in acache_url

* Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown

* fix: remove deprecated markdown_v2 from docker

* Refactor: remove deprecated fit_markdown and fit_html from result

* refactor: fix cache retrieval for markdown as a string

* chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
											
										
										
											2025-02-28 17:23:35 +05:30
+								        )
 								        config = CrawlerRunConfig(
-												refactor(deep-crawl): add max_pages limit and improve crawl control

Add max_pages parameter to all deep crawling strategies to limit total pages crawled.
Add score_threshold parameter to BFS/DFS strategies for quality control.
Remove legacy parameter handling in AsyncWebCrawler.
Improve error handling and logging in crawl strategies.

BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.run_many()

											
										
										
											2025-03-03 21:51:11 +08:00
+								            deep_crawl_strategy=BestFirstCrawlingStrategy(
-												Release prep (#749)

* fix: Update export of URLPatternFilter

* chore: Add dependancy for cchardet in requirements

* docs: Update example for deep crawl in release note for v0.5

* Docs: update the example for memory dispatcher

* docs: updated example for crawl strategies

* Refactor: Removed wrapping in if __name__==main block since this is a markdown file.

* chore: removed cchardet from dependancy list, since unclecode is planning to remove it

* docs: updated the example for proxy rotation to a working example

* feat: Introduced ProxyConfig param

* Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1

* chore: update and test new dependancies

* feat:Make PyPDF2 a conditional dependancy

* updated tutorial and release note for v0.5

* docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename

* refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult

* fix: Bug in serialisation of markdown in acache_url

* Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown

* fix: remove deprecated markdown_v2 from docker

* Refactor: remove deprecated fit_markdown and fit_html from result

* refactor: fix cache retrieval for markdown as a string

* chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
											
										
										
											2025-02-28 17:23:35 +05:30
+								                max_depth=1, include_external=False, url_scorer=keyword_scorer
 								            ),
 								            scraping_strategy=LXMLWebScrapingStrategy(),
 								            cache_mode=CacheMode.BYPASS,
 								            verbose=True,
 								            stream=True,
 								        )
 								        results = []
 								        async for result in await crawler.arun(
 								            url="https://docs.crawl4ai.com", config=config
 								        ):
 								            results.append(result)
 								            score = result.metadata.get("score")
 								            print(f"  → Score: {score:.2f} | {result.url}")
 								        print(f"  ✅ Crawler prioritized {len(results)} pages by relevance score")
 								        print("  🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
# 4️⃣ Advanced Filters
async def advanced_filters() -> None:
    """
    PART 4: Demonstrates advanced filtering techniques for specialized crawling.

    This function covers:
    - SEO filters
    - Text relevancy filtering

    Each example wraps a single filter in its own FilterChain and runs a
    depth-1 BFS crawl of https://docs.crawl4ai.com with the cache bypassed,
    then prints the URLs that were returned.
    """
    print("\n===== ADVANCED FILTERS =====")

    async with AsyncWebCrawler() as crawler:
        # SEO FILTER EXAMPLE
        print("\n📊 EXAMPLE 1: SEO FILTERS")
        print(
            "Quantitative SEO quality assessment filter based searching keywords in the head section"
        )

        seo_filter = SEOFilter(
            threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1, filter_chain=FilterChain([seo_filter])
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,  # always fetch fresh pages for the demo
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        print(f"  ✅ Found {len(results)} pages with relevant keywords")
        for result in results:
            print(f"  → {result.url}")

        # ADVANCED TEXT RELEVANCY FILTER
        print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")

        # More sophisticated content relevance filter: pages are compared
        # against a free-text query instead of a keyword list.
        relevance_filter = ContentRelevanceFilter(
            query="Interact with the web using your authentic digital identity",
            threshold=0.7,
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1, filter_chain=FilterChain([relevance_filter])
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        print(f"  ✅ Found {len(results)} pages")
        for result in results:
            # NOTE(review): falls back to 0 when the result metadata has no
            # "relevance_score" entry — confirm the filter actually records one.
            relevance_score = result.metadata.get("relevance_score", 0)
            print(f"  → Score: {relevance_score:.2f} | {result.url}")
# 5️⃣ Max Pages and Score Thresholds
async def max_pages_and_thresholds() -> None:
    """
    PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.

    This function shows:
    - How to limit the number of pages crawled
    - How to set score thresholds for more targeted crawling
    - Comparing BFS, DFS, and Best-First strategies with these parameters

    The limits are named once below so that the printed explanations always
    match the values actually handed to the strategies.
    """
    print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")

    # Imported locally: the module-level import only brings in the BFS and
    # Best-First strategies.
    from crawl4ai.deep_crawling import DFSDeepCrawlStrategy

    bfs_max_pages = 5
    dfs_score_threshold = 0.7
    dfs_max_pages = 10
    best_first_max_pages = 7

    async with AsyncWebCrawler() as crawler:
        # Define a common keyword scorer for all examples
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["browser", "crawler", "web", "automation"],
            weight=1.0,
        )

        # EXAMPLE 1: BFS WITH MAX PAGES
        print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
        print(f"  Limit the crawler to a maximum of {bfs_max_pages} pages")

        bfs_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=bfs_max_pages,
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)

        print(f"  ✅ Crawled exactly {len(results)} pages as specified by max_pages")
        for result in results:
            depth = result.metadata.get("depth", 0)
            print(f"  → Depth: {depth} | {result.url}")

        # EXAMPLE 2: DFS WITH SCORE THRESHOLD
        print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
        # The message is derived from the configured value; it previously
        # announced 0.5 while the strategy was given 0.7.
        print(f"  Only crawl pages with a relevance score above {dfs_score_threshold}")

        dfs_config = CrawlerRunConfig(
            deep_crawl_strategy=DFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                score_threshold=dfs_score_threshold,
                max_pages=dfs_max_pages,
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)

        print(f"  ✅ Crawled {len(results)} pages with scores above threshold")
        for result in results:
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"  → Depth: {depth} | Score: {score:.2f} | {result.url}")

        # EXAMPLE 3: BEST-FIRST WITH MAX PAGES
        # Only max_pages is configured here, so the output no longer claims a
        # score threshold that is never passed to the strategy.
        print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH MAX PAGES LIMIT")
        print(f"  Limit to {best_first_max_pages} pages, prioritizing highest scores")

        bf_config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=best_first_max_pages,
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
            stream=True,  # results are consumed one by one as they arrive
        )

        results = []
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"  → Depth: {depth} | Score: {score:.2f} | {result.url}")

        print(f"  ✅ Crawled {len(results)} high-value pages")
        if results:
            # Guarded so an empty crawl does not divide by zero.
            avg_score = sum(r.metadata.get("score", 0) for r in results) / len(results)
            print(f"  ✅ Average score: {avg_score:.2f}")
            print("  🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
# 6️⃣ Wrap-Up and Key Takeaways
async def wrap_up() -> None:
    """
    PART 6: Wrap-Up and Key Takeaways

    Summarize the key concepts learned in this tutorial by combining a filter
    chain, a keyword scorer and streaming in a single Best-First crawl, then
    printing the page count, elapsed time, average score and a per-depth
    breakdown.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Create the keyword relevance scorer used to prioritize URLs
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )

    # Set up the configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl; the timer starts before the crawler is opened, so the
    # reported duration includes crawler start-up and shutdown.
    results = []
    start_time = time.perf_counter()

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")

    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    if results:
        # Only computed when something was crawled: an empty result list would
        # otherwise raise ZeroDivisionError.
        avg_score = sum(r.metadata.get("score", 0) for r in results) / len(results)
        print(f"✅ Average score: {avg_score:.2f}")

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1

    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")
async def run_tutorial() -> None:
    """
    Executes all tutorial sections in sequence.

    Each section is an argument-less coroutine function; they are awaited one
    after another, in the order of their PART numbers.
    """
    print("\n🚀 CRAWL4AI DEEP CRAWLING TUTORIAL 🚀")
    print("======================================")
    print("This tutorial will walk you through deep crawling techniques,")
    print("from basic to advanced, using the Crawl4AI library.")

    # Define sections - comment out entries to skip specific parts during development.
    # Listed in PART order (1-6) so the run order matches the section numbering.
    tutorial_sections = [
        basic_deep_crawl,
        stream_vs_nonstream,
        filters_and_scorers,
        advanced_filters,
        max_pages_and_thresholds,
        wrap_up,
    ]

    for section in tutorial_sections:
        await section()

    print("\n🎉 TUTORIAL COMPLETE! 🎉")
    print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
    print("For more information, check out https://docs.crawl4ai.com")
 								# Script entry point: run the whole tutorial via asyncio.run(), but only
 								# when this file is executed directly rather than imported as a module.
 								if __name__ == "__main__":
 								    asyncio.run(run_tutorial())