# crawl4ai/docs/examples/tutorial_v0.5.py

import asyncio
import time
import re
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig, MemoryAdaptiveDispatcher, HTTPCrawlerConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import (
    BestFirstCrawlingStrategy,
    FilterChain,
    URLPatternFilter,
    DomainFilter,
    ContentTypeFilter,
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from pprint import pprint


# 1⃣ Deep Crawling with Best-First Strategy
async def deep_crawl():
    """
    PART 1: Deep Crawling with Best-First Strategy
    This function demonstrates:
    - Using the BestFirstCrawlingStrategy
    - Creating filter chains to narrow down crawl targets
    - Using a scorer to prioritize certain URLs
    - Respecting robots.txt rules
    """
    print("\n===== DEEP CRAWLING =====")
    print("This example shows how to implement deep crawling with filters, scorers, and robots.txt compliance.")
    # Create a filter chain to filter urls based on patterns, domains and content type
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )
    # Create a keyword scorer that prioritises the pages with certain keywords first
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )
    # Set up the configuration with robots.txt compliance enabled
    deep_crawl_config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
        check_robots_txt=True,  # Enable robots.txt compliance
    )
    # Execute the crawl
    async with AsyncWebCrawler() as crawler:
        print("\n📊 Starting deep crawl with Best-First strategy...")
        print(" - Filtering by domain, URL patterns, and content type")
        print(" - Scoring pages based on keyword relevance")
        print(" - Respecting robots.txt rules")
        start_time = time.perf_counter()
        results = []
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=deep_crawl_config):
            # Print each result as it comes in
            depth = result.metadata.get("depth", 0)
            score = result.metadata.get("score", 0)
            print(f"Crawled: {result.url} (Depth: {depth}), score: {score:.2f}")
            results.append(result)
        duration = time.perf_counter() - start_time
        # Print summary statistics
        print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
        # Group by depth
        if results:
            depth_counts = {}
            for result in results:
                depth = result.metadata.get("depth", 0)
                depth_counts[depth] = depth_counts.get(depth, 0) + 1
            print("\n📊 Pages crawled by depth:")
            for depth, count in sorted(depth_counts.items()):
                print(f" Depth {depth}: {count} pages")


# 2⃣ Memory-Adaptive Dispatcher
async def memory_adaptive_dispatcher():
    """
    PART 2: Memory-Adaptive Dispatcher
    This function demonstrates:
    - Using MemoryAdaptiveDispatcher to manage system memory
    - Batch and streaming modes with multiple URLs
    """
    print("\n===== MEMORY-ADAPTIVE DISPATCHER =====")
    print("This example shows how to use the memory-adaptive dispatcher for resource management.")
    # Configure the dispatcher (optional, defaults are used if not provided)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,  # Pause if memory usage exceeds 80%
        check_interval=0.5,  # Check memory every 0.5 seconds
    )
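    # Concurrency can also be capped explicitly. A minimal sketch — the
    # max_session_permit parameter name is taken from the v0.5 dispatcher docs
    # and should be verified against your installed version:
    # dispatcher = MemoryAdaptiveDispatcher(
    #     memory_threshold_percent=70.0,
    #     check_interval=1.0,
    #     max_session_permit=5,  # at most 5 concurrent crawl sessions (assumed param)
    # )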
    # Test URLs
    urls = [
        "https://docs.crawl4ai.com",
        "https://github.com/unclecode/crawl4ai"
    ]

    async def batch_mode():
        print("\n📊 BATCH MODE:")
        print(" In this mode, all results are collected before being returned.")
        async with AsyncWebCrawler() as crawler:
            start_time = time.perf_counter()
            results = await crawler.arun_many(
                urls=urls,
                config=CrawlerRunConfig(stream=False),  # Batch mode
                dispatcher=dispatcher,
            )
            print(f" ✅ Received all {len(results)} results after {time.perf_counter() - start_time:.2f} seconds")
            for result in results:
                print(f"{result.url} with status code: {result.status_code}")

    async def stream_mode():
        print("\n📊 STREAMING MODE:")
        print(" In this mode, results are processed as they become available.")
        async with AsyncWebCrawler() as crawler:
            start_time = time.perf_counter()
            count = 0
            first_result_time = None
            async for result in await crawler.arun_many(
                urls=urls,
                config=CrawlerRunConfig(stream=True),  # Stream mode
                dispatcher=dispatcher,
            ):
                count += 1
                current_time = time.perf_counter() - start_time
                if count == 1:
                    first_result_time = current_time
                    print(f" ✅ First result after {first_result_time:.2f} seconds: {result.url}")
                else:
                    print(f" → Result #{count} after {current_time:.2f} seconds: {result.url}")
            print(f" ✅ Total: {count} results")
            print(f" ✅ First result: {first_result_time:.2f} seconds")
            print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")

    # Run both examples
    await batch_mode()
    await stream_mode()
    print("\n🔍 Key Takeaway: The memory-adaptive dispatcher prevents OOM errors")
    print(" and manages concurrency based on system resources.")


# 3⃣ HTTP Crawler Strategy
async def http_crawler_strategy():
    """
    PART 3: HTTP Crawler Strategy
    This function demonstrates:
    - Using the lightweight HTTP-only crawler
    - Setting custom headers and configurations
    """
    print("\n===== HTTP CRAWLER STRATEGY =====")
    print("This example shows how to use the fast, lightweight HTTP-only crawler.")
    # Use the HTTP crawler strategy
    http_config = HTTPCrawlerConfig(
        method="GET",
        headers={"User-Agent": "MyCustomBot/1.0"},
        follow_redirects=True,
        verify_ssl=True
    )
    print("\n📊 Initializing HTTP crawler strategy...")
    print(" - Using custom User-Agent: MyCustomBot/1.0")
    print(" - Following redirects: Enabled")
    print(" - Verifying SSL: Enabled")
    # Create crawler with HTTP strategy
    async with AsyncWebCrawler(
        crawler_strategy=AsyncHTTPCrawlerStrategy(browser_config=http_config)
    ) as crawler:
        start_time = time.perf_counter()
        result = await crawler.arun("https://example.com")
        duration = time.perf_counter() - start_time
        print(f"\n✅ Crawled in {duration:.2f} seconds")
        print(f"✅ Status code: {result.status_code}")
        print(f"✅ Content length: {len(result.html)} bytes")
        # Check if there was a redirect
        if result.redirected_url and result.redirected_url != result.url:
            print(f" Redirected from {result.url} to {result.redirected_url}")
    print("\n🔍 Key Takeaway: HTTP crawler is faster and more memory-efficient")
    print(" than browser-based crawling for simple pages.")


# 4⃣ Proxy Rotation
async def proxy_rotation():
    """
    PART 4: Proxy Rotation
    This function demonstrates:
    - Setting up a proxy rotation strategy
    - Using multiple proxies in a round-robin fashion
    """
    print("\n===== PROXY ROTATION =====")
    print("This example shows how to implement proxy rotation for distributed crawling.")
    # Load proxies from the environment and create a rotation strategy,
    # e.g.: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
    proxies = ProxyConfig.from_env()
    if not proxies:
        print("No proxies found in environment. Set the PROXIES env variable!")
        return
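    # Proxies can also be constructed in code instead of from the environment.
    # A minimal sketch — the keyword names mirror the fields used below
    # (server / ip) plus the credentials from the PROXIES format, but treat
    # them as assumptions and check ProxyConfig in your installed version:
    # proxies = [
    #     ProxyConfig(server="http://1.2.3.4:8080", username="user",
    #                 password="pass", ip="1.2.3.4"),
    # ]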
    proxy_strategy = RoundRobinProxyStrategy(proxies)
    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=proxy_strategy
    )
    urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice
    print("\n📈 Initializing crawler with proxy rotation...")
    async with AsyncWebCrawler(config=browser_config) as crawler:
        print("\n🚀 Starting batch crawl with proxy rotation...")
        results = await crawler.arun_many(
            urls=urls,
            config=run_config
        )
        for result in results:
            if result.success:
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                current_proxy = run_config.proxy_config if run_config.proxy_config else None
                if current_proxy and ip_match:
                    print(f"URL {result.url}")
                    print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
                    verified = ip_match.group(0) == current_proxy.ip
                    if verified:
                        print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
                    else:
                        print("❌ Proxy failed or IP mismatch!")
                print("---")
            else:
                print(f"❌ Crawl via proxy failed: {result.error_message}")


# 5⃣ LLM Content Filter (requires API key)
async def llm_content_filter():
    """
    PART 5: LLM Content Filter
    This function demonstrates:
    - Configuring LLM providers via LLMConfig
    - Using LLM to generate focused markdown
    - LLMConfig for configuration
    Note: Requires a valid API key for the chosen LLM provider
    """
    print("\n===== LLM CONTENT FILTER =====")
    print("This example shows how to use LLM to generate focused markdown content.")
    print("Note: This example requires an API key. Set it in environment variables.")
    # Create LLM configuration
    # Replace with your actual API key or set as environment variable
    llm_config = LLMConfig(
        provider="gemini/gemini-1.5-pro",
        api_token="env:GEMINI_API_KEY"  # Will read from GEMINI_API_KEY environment variable
    )
    print("\n📊 Setting up LLM content filter...")
    print(f" - Provider: {llm_config.provider}")
    print(" - API token: Using environment variable")
    print(" - Instruction: Extract key concepts and summaries")
    # Create markdown generator with LLM filter
    markdown_generator = DefaultMarkdownGenerator(
        content_filter=LLMContentFilter(
            llm_config=llm_config,
            instruction="Extract key concepts and summaries"
        )
    )
    config = CrawlerRunConfig(markdown_generator=markdown_generator)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://docs.crawl4ai.com", config=config)
        print("\n✅ Generated focused markdown:")
        pprint(result.markdown.fit_markdown)


# 6⃣ PDF Processing
async def pdf_processing():
    """
    PART 6: PDF Processing
    This function demonstrates:
    - Using PDFCrawlerStrategy and PDFContentScrapingStrategy
    - Extracting text and metadata from PDFs
    """
    print("\n===== PDF PROCESSING =====")
    print("This example shows how to extract text and metadata from PDF files.")
    # Sample PDF URL
    pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
    print("\n📊 Initializing PDF crawler...")
    print(f" - Target PDF: {pdf_url}")
    print(" - Using PDFCrawlerStrategy and PDFContentScrapingStrategy")
    # Create crawler with PDF strategy
    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        print("\n🚀 Starting PDF processing...")
        start_time = time.perf_counter()
        result = await crawler.arun(
            pdf_url,
            config=CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy())
        )
        duration = time.perf_counter() - start_time
        print(f"\n✅ Processed PDF in {duration:.2f} seconds")
        # Show metadata
        print("\n📄 PDF Metadata:")
        if result.metadata:
            for key, value in result.metadata.items():
                if key not in ["html", "text", "markdown"] and value:
                    print(f" - {key}: {value}")
        else:
            print(" No metadata available")
        # Show sample of content
        if result.markdown:
            print("\n📝 PDF Content Sample:")
            content_sample = result.markdown[:500] + "..." if len(result.markdown) > 500 else result.markdown
            print(f"---\n{content_sample}\n---")
        else:
            print("\n⚠️ No content extracted")
    print("\n🔍 Key Takeaway: Crawl4AI can now process PDF files")
    print(" to extract both text content and metadata.")


# 7⃣ LLM Schema Generation (requires API key)
async def llm_schema_generation():
    """
    PART 7: LLM Schema Generation
    This function demonstrates:
    - Configuring LLM providers via LLMConfig
    - Using LLM to generate extraction schemas
    - JsonCssExtractionStrategy
    Note: Requires a valid API key for the chosen LLM provider
    """
    print("\n===== LLM SCHEMA GENERATION =====")
    print("This example shows how to use LLM to automatically generate extraction schemas.")
    print("Note: This example requires an API key. Set it in environment variables.")
    # Sample HTML
    sample_html = """
    <div class="product">
        <h2 class="title">Awesome Gaming Laptop</h2>
        <div class="price">$1,299.99</div>
        <div class="specs">
            <ul>
                <li>16GB RAM</li>
                <li>512GB SSD</li>
                <li>RTX 3080</li>
            </ul>
        </div>
        <div class="rating">4.7/5</div>
    </div>
    """
    print("\n📊 Setting up LLMConfig...")
    # Create LLM configuration
    llm_config = LLMConfig(
        provider="gemini/gemini-1.5-pro",
        api_token="env:GEMINI_API_KEY"
    )
    print("\n🚀 Generating schema for product extraction...")
    print(" This uses the LLM to analyze the HTML and create an extraction schema")
    schema = JsonCssExtractionStrategy.generate_schema(
        html=sample_html,
        llm_config=llm_config,
        query="Extract product name and price"
    )
    print("\n✅ Generated Schema:")
    pprint(schema)
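    # A generated schema is meant to be reused without further LLM calls. A
    # minimal follow-up sketch (not executed here; the product-page URL is a
    # placeholder you would replace with a real page):
    # extraction_config = CrawlerRunConfig(
    #     extraction_strategy=JsonCssExtractionStrategy(schema)
    # )
    # async with AsyncWebCrawler() as crawler:
    #     result = await crawler.arun("https://example.com/product-page", config=extraction_config)
    #     pprint(result.extracted_content)  # JSON string with the extracted fields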


# Run all sections
async def run_tutorial():
    """
    Main function to run all tutorial sections.
    """
    print("\n🚀 CRAWL4AI v0.5.0 TUTORIAL 🚀")
    print("===============================")
    print("This tutorial demonstrates the key features of Crawl4AI v0.5.0")
    print("Including deep crawling, memory-adaptive dispatching, advanced filtering,")
    print("and more powerful extraction capabilities.")
    # Sections to run
    sections = [
        deep_crawl,                  # 1. Deep Crawling with Best-First Strategy
        memory_adaptive_dispatcher,  # 2. Memory-Adaptive Dispatcher
        http_crawler_strategy,       # 3. HTTP Crawler Strategy
        proxy_rotation,              # 4. Proxy Rotation
        llm_content_filter,          # 5. LLM Content Filter
        pdf_processing,              # 6. PDF Processing
        llm_schema_generation,       # 7. Schema Generation using LLM
    ]
    for section in sections:
        try:
            await section()
        except Exception as e:
            print(f"⚠️ Error in {section.__name__}: {e}")
    print("\n🎉 TUTORIAL COMPLETE! 🎉")
    print("You've now explored the key features of Crawl4AI v0.5.0")
    print("For more information, visit https://docs.crawl4ai.com")


# Run the tutorial
if __name__ == "__main__":
    asyncio.run(run_tutorial())