# crawl4ai/docs/examples/v0_4_3_features_demo.py
"""
Crawl4ai v0.4.3 Features Demo
============================
This example demonstrates the major new features introduced in Crawl4ai v0.4.3.
Each section showcases a specific feature with practical examples and explanations.
"""
import asyncio
import json
import os
import random
import re
from typing import Dict, Optional

from crawl4ai import *


async def demo_memory_dispatcher():
    """
    1. Memory Dispatcher System Demo
    ================================
    Shows how to use the new memory dispatcher with monitoring.
    """
    print("\n=== 1. Memory Dispatcher System Demo ===")
    # Configure crawler
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator()
    )
    # Test URLs
    urls = ["http://example.com", "http://example.org", "http://example.net"] * 3
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Initialize dispatcher with monitoring
        monitor = CrawlerMonitor(
            max_visible_rows=10,
            display_mode=DisplayMode.DETAILED,  # Can be DETAILED or AGGREGATED
        )
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=80.0,  # Memory usage threshold
            check_interval=0.5,  # How often to check memory
            max_session_permit=5,  # Max concurrent crawls
            monitor=monitor,  # Pass the monitor
        )
        # Run with memory monitoring
        print("Starting batch crawl with memory monitoring...")
        results = await dispatcher.run_urls(
            urls=urls,
            crawler=crawler,
            config=crawler_config,
        )
        print(f"Completed {len(results)} URLs")


async def demo_streaming_support():
    """
    2. Streaming Support Demo
    =========================
    Shows how to process URLs as they complete using streaming.
    """
    print("\n=== 2. Streaming Support Demo ===")
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)
    # Test URLs
    urls = ["http://example.com", "http://example.org", "http://example.net"] * 2
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Initialize dispatcher for streaming
        dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5)
        print("Starting streaming crawl...")
        async for result in dispatcher.run_urls_stream(
            urls=urls, crawler=crawler, config=crawler_config
        ):
            # Process each result as it arrives
            print(
                f"Received result for {result.url} - Success: {result.result.success}"
            )
            if result.result.success:
                print(f"Content length: {len(result.result.markdown)}")


async def demo_content_scraping():
    """
    3. Content Scraping Strategy Demo
    =================================
    Demonstrates the new LXMLWebScrapingStrategy for faster content scraping.
    """
    print("\n=== 3. Content Scraping Strategy Demo ===")
    crawler = AsyncWebCrawler()
    url = "https://example.com/article"
    # Configure with the new LXML strategy
    config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy(), verbose=True)
    print("Scraping content with LXML strategy...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if result.success:
            print("Successfully scraped content using LXML strategy")


async def demo_llm_markdown():
    """
    4. LLM-Powered Markdown Generation Demo
    =======================================
    Shows how to use the new LLM-powered content filtering and markdown generation.
    """
    print("\n=== 4. LLM-Powered Markdown Generation Demo ===")
    crawler = AsyncWebCrawler()
    url = "https://docs.python.org/3/tutorial/classes.html"
    content_filter = LLMContentFilter(
        provider="openai/gpt-4o",
        api_token=os.getenv("OPENAI_API_KEY"),
        instruction="""
        Focus on extracting the core educational content about Python classes.
        Include:
        - Key concepts and their explanations
        - Important code examples
        - Essential technical details
        Exclude:
        - Navigation elements
        - Sidebars
        - Footer content
        - Version information
        - Any non-essential UI elements
        Format the output as clean markdown with proper code blocks and headers.
        """,
        verbose=True,
    )
    # Configure LLM-powered markdown generation
    config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(content_filter=content_filter),
        cache_mode=CacheMode.BYPASS,
        verbose=True,
    )
    print("Generating focused markdown with LLM...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if result.success and result.markdown_v2:
            print("Successfully generated LLM-filtered markdown")
            print("First 500 chars of filtered content:")
            print(result.markdown_v2.fit_markdown[:500])


async def demo_robots_compliance():
    """
    5. Robots.txt Compliance Demo
    =============================
    Demonstrates the new robots.txt compliance feature with SQLite caching.
    """
    print("\n=== 5. Robots.txt Compliance Demo ===")
    crawler = AsyncWebCrawler()
    urls = ["https://example.com", "https://facebook.com", "https://twitter.com"]
    # Enable robots.txt checking
    config = CrawlerRunConfig(check_robots_txt=True, verbose=True)
    print("Crawling with robots.txt compliance...")
    async with crawler:
        results = await crawler.arun_many(urls, config=config)
        for result in results:
            if result.status_code == 403:
                print(f"Access blocked by robots.txt: {result.url}")
            elif result.success:
                print(f"Successfully crawled: {result.url}")


async def demo_llm_schema_generation():
    """
    6. LLM-Powered Schema Generation Demo
    =====================================
    Demonstrates automatic CSS and XPath schema generation using LLM models.
    """
    print("\n=== 6. LLM-Powered Schema Generation Demo ===")
    # Example HTML content for a job listing
    html_content = """
    <div class="job-listing">
        <h1 class="job-title">Senior Software Engineer</h1>
        <div class="job-details">
            <span class="location">San Francisco, CA</span>
            <span class="salary">$150,000 - $200,000</span>
            <div class="requirements">
                <h2>Requirements</h2>
                <ul>
                    <li>5+ years Python experience</li>
                    <li>Strong background in web crawling</li>
                </ul>
            </div>
        </div>
    </div>
    """
print("Generating CSS selectors schema...")
# Generate CSS selectors with a specific query
css_schema = JsonCssExtractionStrategy.generate_schema(
html_content,
schema_type="CSS",
query="Extract job title, location, and salary information",
provider="openai/gpt-4o", # or use other providers like "ollama"
)
print("\nGenerated CSS Schema:")
print(css_schema)
# Example of using the generated schema with crawler
crawler = AsyncWebCrawler()
url = "https://example.com/job-listing"
# Create an extraction strategy with the generated schema
extraction_strategy = JsonCssExtractionStrategy(schema=css_schema)
config = CrawlerRunConfig(extraction_strategy=extraction_strategy, verbose=True)
print("\nTesting generated schema with crawler...")
async with crawler:
result = await crawler.arun(url, config=config)
if result.success:
            print("Successfully used generated schema for crawling")
            if result.extracted_content:
                # extracted_content is a JSON string; pretty-print it
                print(json.dumps(json.loads(result.extracted_content), indent=2))


async def get_next_proxy(proxy_file: str = "proxies.txt") -> Optional[Dict]:
    """Get the next proxy from a local file."""
    try:
        with open(proxy_file) as f:
            proxies = f.read().splitlines()
        if not proxies:
            return None
        ip, port, username, password = random.choice(proxies).split(":")
        return {
            "server": f"http://{ip}:{port}",
            "username": username,
            "password": password,
            "ip": ip,  # Store the original IP for verification
        }
    except Exception as e:
        print(f"Error loading proxy: {e}")
        return None


async def demo_proxy_rotation():
    """
    7. Proxy Rotation Demo
    ======================
    Demonstrates how to rotate proxies for each request using Crawl4ai.
    """
    print("\n=== 7. Proxy Rotation Demo ===")
    # Send a few test requests to httpbin, each through a different proxy
    urls = ["https://httpbin.org/ip"] * 3
    browser_config = BrowserConfig(headless=True)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            proxy = await get_next_proxy()
            if not proxy:
                print("No proxy available, skipping...")
                continue
            # Create a new config with the proxy
            current_config = run_config.clone(proxy_config=proxy)
            result = await crawler.arun(url=url, config=current_config)
            if result.success:
                ip_match = re.search(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}", result.html)
                response_ip = ip_match.group(0) if ip_match else None
                print(f"Proxy {proxy['ip']} -> Response IP: {response_ip or 'Not found'}")
                if response_ip == proxy["ip"]:
                    print(f"✅ Proxy working! IP matches: {proxy['ip']}")
                else:
                    print("❌ Proxy failed or IP mismatch!")
            else:
                print(f"Failed with proxy {proxy['ip']}")


async def main():
    """Run all feature demonstrations."""
    await demo_memory_dispatcher()
    print("\n" + "=" * 50 + "\n")
    await demo_streaming_support()
    print("\n" + "=" * 50 + "\n")
    await demo_content_scraping()
    print("\n" + "=" * 50 + "\n")
    await demo_llm_schema_generation()
    print("\n" + "=" * 50 + "\n")
    await demo_llm_markdown()
    print("\n" + "=" * 50 + "\n")
    await demo_robots_compliance()
    print("\n" + "=" * 50 + "\n")
    await demo_proxy_rotation()
    print("\n" + "=" * 50 + "\n")


if __name__ == "__main__":
    asyncio.run(main())