""" Crawl4ai v0.4.3b2 Features Demo ============================ This demonstration showcases three major categories of new features in Crawl4ai v0.4.3: 1. Efficiency & Speed: - Memory-efficient dispatcher strategies - New scraping algorithm - Streaming support for batch crawling 2. LLM Integration: - Automatic schema generation - LLM-powered content filtering - Smart markdown generation 3. Core Improvements: - Robots.txt compliance - Proxy rotation - Enhanced URL handling - Shared data among hooks - add page routes Each demo function can be run independently or as part of the full suite. """ import asyncio import os import json import re import random from typing import Optional, Dict from dotenv import load_dotenv load_dotenv() from crawl4ai import ( AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, DisplayMode, MemoryAdaptiveDispatcher, CrawlerMonitor, DefaultMarkdownGenerator, LXMLWebScrapingStrategy, JsonCssExtractionStrategy, LLMContentFilter ) async def demo_memory_dispatcher(): """Demonstrates the new memory-efficient dispatcher system. Key Features: - Adaptive memory management - Real-time performance monitoring - Concurrent session control """ print("\n=== Memory Dispatcher Demo ===") try: # Configuration browser_config = BrowserConfig(headless=True, verbose=False) crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator() ) # Test URLs urls = ["http://example.com", "http://example.org", "http://example.net"] * 3 print("\nš Initializing crawler with memory monitoring...") async with AsyncWebCrawler(config=browser_config) as crawler: monitor = CrawlerMonitor( max_visible_rows=10, display_mode=DisplayMode.DETAILED ) dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=80.0, check_interval=0.5, max_session_permit=5, monitor=monitor ) print("\nš Starting batch crawl...") results = await crawler.arun_many( urls=urls, config=crawler_config, dispatcher=dispatcher ) print(f"\nā Completed {len(results)} URLs successfully") except Exception as e: print(f"\nā Error in memory dispatcher demo: {str(e)}") async def demo_streaming_support(): """ 2. Streaming Support Demo ====================== Shows how to process URLs as they complete using streaming """ print("\n=== 2. Streaming Support Demo ===") browser_config = BrowserConfig(headless=True, verbose=False) crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True) # Test URLs urls = ["http://example.com", "http://example.org", "http://example.net"] * 2 async with AsyncWebCrawler(config=browser_config) as crawler: # Initialize dispatcher for streaming dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5) print("Starting streaming crawl...") async for result in await crawler.arun_many( urls=urls, config=crawler_config, dispatcher=dispatcher ): # Process each result as it arrives print( f"Received result for {result.url} - Success: {result.success}" ) if result.success: print(f"Content length: {len(result.markdown)}") async def demo_content_scraping(): """ 3. Content Scraping Strategy Demo ============================== Demonstrates the new LXMLWebScrapingStrategy for faster content scraping. """ print("\n=== 3. 

async def demo_content_scraping():
    """
    3. Content Scraping Strategy Demo
    =================================
    Demonstrates the new LXMLWebScrapingStrategy for faster content scraping.
    """
    print("\n=== 3. Content Scraping Strategy Demo ===")
    crawler = AsyncWebCrawler()
    url = "https://example.com/article"

    # Configure with the new LXML strategy
    config = CrawlerRunConfig(
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,
    )

    print("Scraping content with LXML strategy...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if result.success:
            print("Successfully scraped content using LXML strategy")


async def demo_llm_markdown():
    """
    4. LLM-Powered Markdown Generation Demo
    =======================================
    Shows how to use the new LLM-powered content filtering and markdown generation.
    """
    print("\n=== 4. LLM-Powered Markdown Generation Demo ===")
    crawler = AsyncWebCrawler()
    url = "https://docs.python.org/3/tutorial/classes.html"

    content_filter = LLMContentFilter(
        provider="openai/gpt-4o",
        api_token=os.getenv("OPENAI_API_KEY"),
        instruction="""
        Focus on extracting the core educational content about Python classes.
        Include:
        - Key concepts and their explanations
        - Important code examples
        - Essential technical details
        Exclude:
        - Navigation elements
        - Sidebars
        - Footer content
        - Version information
        - Any non-essential UI elements
        Format the output as clean markdown with proper code blocks and headers.
        """,
        verbose=True,
    )

    # Configure LLM-powered markdown generation
    config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(content_filter=content_filter),
        cache_mode=CacheMode.BYPASS,
        verbose=True,
    )

    print("Generating focused markdown with LLM...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if result.success and result.markdown_v2:
            print("Successfully generated LLM-filtered markdown")
            print("First 500 chars of filtered content:")
            print(result.markdown_v2.fit_markdown[:500])


async def demo_robots_compliance():
    """
    5. Robots.txt Compliance Demo
    =============================
    Demonstrates the new robots.txt compliance feature with SQLite caching.
    """
    print("\n=== 5. Robots.txt Compliance Demo ===")
    crawler = AsyncWebCrawler()
    urls = ["https://example.com", "https://facebook.com", "https://twitter.com"]

    # Enable robots.txt checking
    config = CrawlerRunConfig(check_robots_txt=True, verbose=True)

    print("Crawling with robots.txt compliance...")
    async with crawler:
        results = await crawler.arun_many(urls, config=config)
        for result in results:
            if result.status_code == 403:
                print(f"Access blocked by robots.txt: {result.url}")
            elif result.success:
                print(f"Successfully crawled: {result.url}")


async def demo_json_schema_generation():
    """
    7. LLM-Powered Schema Generation Demo
    =====================================
    Demonstrates automatic CSS and XPath schema generation using LLM models.
    """
    print("\n=== 7. LLM-Powered Schema Generation Demo ===")

    # Example HTML content for a job listing
    html_content = """