import asyncio import time import re from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig, MemoryAdaptiveDispatcher, HTTPCrawlerConfig from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy from crawl4ai.deep_crawling import ( BestFirstCrawlingStrategy, FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ) from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy from crawl4ai import ProxyConfig from crawl4ai import RoundRobinProxyStrategy from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai import DefaultMarkdownGenerator from crawl4ai import LLMConfig from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy from pprint import pprint # 1ļøāƒ£ Deep Crawling with Best-First Strategy async def deep_crawl(): """ PART 1: Deep Crawling with Best-First Strategy This function demonstrates: - Using the BestFirstCrawlingStrategy - Creating filter chains to narrow down crawl targets - Using a scorer to prioritize certain URLs - Respecting robots.txt rules """ print("\n===== DEEP CRAWLING =====") print("This example shows how to implement deep crawling with filters, scorers, and robots.txt compliance.") # Create a filter chain to filter urls based on patterns, domains and content type filter_chain = FilterChain( [ DomainFilter( allowed_domains=["docs.crawl4ai.com"], blocked_domains=["old.docs.crawl4ai.com"], ), URLPatternFilter(patterns=["*core*", "*advanced*"],), ContentTypeFilter(allowed_types=["text/html"]), ] ) # Create a keyword scorer that prioritises the pages with certain keywords first keyword_scorer = KeywordRelevanceScorer( keywords=["crawl", "example", "async", "configuration"], weight=0.7 ) # Set up the configuration with robots.txt compliance enabled deep_crawl_config = CrawlerRunConfig( deep_crawl_strategy=BestFirstCrawlingStrategy( max_depth=2, include_external=False, filter_chain=filter_chain, url_scorer=keyword_scorer, ), scraping_strategy=LXMLWebScrapingStrategy(), stream=True, verbose=True, check_robots_txt=True, # Enable robots.txt compliance ) # Execute the crawl async with AsyncWebCrawler() as crawler: print("\nšŸ“Š Starting deep crawl with Best-First strategy...") print(" - Filtering by domain, URL patterns, and content type") print(" - Scoring pages based on keyword relevance") print(" - Respecting robots.txt rules") start_time = time.perf_counter() results = [] async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=deep_crawl_config): # Print each result as it comes in depth = result.metadata.get("depth", 0) score = result.metadata.get("score", 0) print(f"Crawled: {result.url} (Depth: {depth}), score: {score:.2f}") results.append(result) duration = time.perf_counter() - start_time # Print summary statistics print(f"\nāœ… Crawled {len(results)} high-value pages in {duration:.2f} seconds") # Group by depth if results: depth_counts = {} for result in results: depth = result.metadata.get("depth", 0) depth_counts[depth] = depth_counts.get(depth, 0) + 1 print("\nšŸ“Š Pages crawled by depth:") for depth, count in sorted(depth_counts.items()): print(f" Depth {depth}: {count} pages") # 2ļøāƒ£ Memory-Adaptive Dispatcher async def memory_adaptive_dispatcher(): """ PART 2: Memory-Adaptive Dispatcher This function demonstrates: - Using MemoryAdaptiveDispatcher to manage system memory - Batch and streaming modes with multiple URLs """ print("\n===== MEMORY-ADAPTIVE DISPATCHER =====") print("This example shows how to use the memory-adaptive dispatcher for resource management.") # Configure the dispatcher (optional, defaults are used if not provided) dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=80.0, # Pause if memory usage exceeds 80% check_interval=0.5, # Check memory every 0.5 seconds ) # Test URLs urls = [ "https://docs.crawl4ai.com", "https://github.com/unclecode/crawl4ai" ] async def batch_mode(): print("\nšŸ“Š BATCH MODE:") print(" In this mode, all results are collected before being returned.") async with AsyncWebCrawler() as crawler: start_time = time.perf_counter() results = await crawler.arun_many( urls=urls, config=CrawlerRunConfig(stream=False), # Batch mode dispatcher=dispatcher, ) print(f" āœ… Received all {len(results)} results after {time.perf_counter() - start_time:.2f} seconds") for result in results: print(f" → {result.url} with status code: {result.status_code}") async def stream_mode(): print("\nšŸ“Š STREAMING MODE:") print(" In this mode, results are processed as they become available.") async with AsyncWebCrawler() as crawler: start_time = time.perf_counter() count = 0 first_result_time = None async for result in await crawler.arun_many( urls=urls, config=CrawlerRunConfig(stream=True), # Stream mode dispatcher=dispatcher, ): count += 1 current_time = time.perf_counter() - start_time if count == 1: first_result_time = current_time print(f" āœ… First result after {first_result_time:.2f} seconds: {result.url}") else: print(f" → Result #{count} after {current_time:.2f} seconds: {result.url}") print(f" āœ… Total: {count} results") print(f" āœ… First result: {first_result_time:.2f} seconds") print(f" āœ… All results: {time.perf_counter() - start_time:.2f} seconds") # Run both examples await batch_mode() await stream_mode() print("\nšŸ” Key Takeaway: The memory-adaptive dispatcher prevents OOM errors") print(" and manages concurrency based on system resources.") # 3ļøāƒ£ HTTP Crawler Strategy async def http_crawler_strategy(): """ PART 3: HTTP Crawler Strategy This function demonstrates: - Using the lightweight HTTP-only crawler - Setting custom headers and configurations """ print("\n===== HTTP CRAWLER STRATEGY =====") print("This example shows how to use the fast, lightweight HTTP-only crawler.") # Use the HTTP crawler strategy http_config = HTTPCrawlerConfig( method="GET", headers={"User-Agent": "MyCustomBot/1.0"}, follow_redirects=True, verify_ssl=True ) print("\nšŸ“Š Initializing HTTP crawler strategy...") print(" - Using custom User-Agent: MyCustomBot/1.0") print(" - Following redirects: Enabled") print(" - Verifying SSL: Enabled") # Create crawler with HTTP strategy async with AsyncWebCrawler( crawler_strategy=AsyncHTTPCrawlerStrategy(browser_config=http_config) ) as crawler: start_time = time.perf_counter() result = await crawler.arun("https://example.com") duration = time.perf_counter() - start_time print(f"\nāœ… Crawled in {duration:.2f} seconds") print(f"āœ… Status code: {result.status_code}") print(f"āœ… Content length: {len(result.html)} bytes") # Check if there was a redirect if result.redirected_url and result.redirected_url != result.url: print(f"ā„¹ļø Redirected from {result.url} to {result.redirected_url}") print("\nšŸ” Key Takeaway: HTTP crawler is faster and more memory-efficient") print(" than browser-based crawling for simple pages.") # 4ļøāƒ£ Proxy Rotation async def proxy_rotation(): """ PART 4: Proxy Rotation This function demonstrates: - Setting up a proxy rotation strategy - Using multiple proxies in a round-robin fashion """ print("\n===== PROXY ROTATION =====") print("This example shows how to implement proxy rotation for distributed crawling.") # Load proxies and create rotation strategy proxies = ProxyConfig.from_env() #eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2" if not proxies: print("No proxies found in environment. Set PROXIES env variable!") return proxy_strategy = RoundRobinProxyStrategy(proxies) # Create configs browser_config = BrowserConfig(headless=True, verbose=False) run_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, proxy_rotation_strategy=proxy_strategy ) async with AsyncWebCrawler(config=browser_config) as crawler: urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice print("\nšŸ“ˆ Initializing crawler with proxy rotation...") async with AsyncWebCrawler(config=browser_config) as crawler: print("\nšŸš€ Starting batch crawl with proxy rotation...") results = await crawler.arun_many( urls=urls, config=run_config ) for result in results: if result.success: ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html) current_proxy = run_config.proxy_config if run_config.proxy_config else None if current_proxy and ip_match: print(f"URL {result.url}") print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}") verified = ip_match.group(0) == current_proxy.ip if verified: print(f"āœ… Proxy working! IP matches: {current_proxy.ip}") else: print("āŒ Proxy failed or IP mismatch!") print("---") else: print(f"āŒ Crawl via proxy failed!: {result.error_message}") # 5ļøāƒ£ LLM Content Filter (requires API key) async def llm_content_filter(): """ PART 5: LLM Content Filter This function demonstrates: - Configuring LLM providers via LLMConfig - Using LLM to generate focused markdown - LLMConfig for configuration Note: Requires a valid API key for the chosen LLM provider """ print("\n===== LLM CONTENT FILTER =====") print("This example shows how to use LLM to generate focused markdown content.") print("Note: This example requires an API key. Set it in environment variables.") # Create LLM configuration # Replace with your actual API key or set as environment variable llm_config = LLMConfig( provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable ) print("\nšŸ“Š Setting up LLM content filter...") print(f" - Provider: {llm_config.provider}") print(" - API token: Using environment variable") print(" - Instruction: Extract key concepts and summaries") # Create markdown generator with LLM filter markdown_generator = DefaultMarkdownGenerator( content_filter=LLMContentFilter( llm_config=llm_config, instruction="Extract key concepts and summaries" ) ) config = CrawlerRunConfig(markdown_generator=markdown_generator) async with AsyncWebCrawler() as crawler: result = await crawler.arun("https://docs.crawl4ai.com", config=config) pprint(result.markdown.fit_markdown) print("\nāœ… Generated focused markdown:") # 6ļøāƒ£ PDF Processing async def pdf_processing(): """ PART 6: PDF Processing This function demonstrates: - Using PDFCrawlerStrategy and PDFContentScrapingStrategy - Extracting text and metadata from PDFs """ print("\n===== PDF PROCESSING =====") print("This example shows how to extract text and metadata from PDF files.") # Sample PDF URL pdf_url = "https://arxiv.org/pdf/2310.06825.pdf" print("\nšŸ“Š Initializing PDF crawler...") print(f" - Target PDF: {pdf_url}") print(" - Using PDFCrawlerStrategy and PDFContentScrapingStrategy") # Create crawler with PDF strategy async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler: print("\nšŸš€ Starting PDF processing...") start_time = time.perf_counter() result = await crawler.arun( pdf_url, config=CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy()) ) duration = time.perf_counter() - start_time print(f"\nāœ… Processed PDF in {duration:.2f} seconds") # Show metadata print("\nšŸ“„ PDF Metadata:") if result.metadata: for key, value in result.metadata.items(): if key not in ["html", "text", "markdown"] and value: print(f" - {key}: {value}") else: print(" No metadata available") # Show sample of content if result.markdown: print("\nšŸ“ PDF Content Sample:") content_sample = result.markdown[:500] + "..." if len(result.markdown) > 500 else result.markdown print(f"---\n{content_sample}\n---") else: print("\nāš ļø No content extracted") print("\nšŸ” Key Takeaway: Crawl4AI can now process PDF files") print(" to extract both text content and metadata.") # 7ļøāƒ£ LLM Schema Generation (requires API key) async def llm_schema_generation(): """ PART 7: LLM Schema Generation This function demonstrates: - Configuring LLM providers via LLMConfig - Using LLM to generate extraction schemas - JsonCssExtractionStrategy Note: Requires a valid API key for the chosen LLM provider """ print("\n===== LLM SCHEMA GENERATION =====") print("This example shows how to use LLM to automatically generate extraction schemas.") print("Note: This example requires an API key. Set it in environment variables.") # Sample HTML sample_html = """

Awesome Gaming Laptop

$1,299.99
4.7/5
""" print("\nšŸ“Š Setting up LLMConfig...") # Create LLM configuration llm_config = LLMConfig( provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY" ) print("\nšŸš€ Generating schema for product extraction...") print(" This would use the LLM to analyze HTML and create an extraction schema") schema = JsonCssExtractionStrategy.generate_schema( html=sample_html, llm_config = llm_config, query="Extract product name and price" ) print("\nāœ… Generated Schema:") pprint(schema) # Run all sections async def run_tutorial(): """ Main function to run all tutorial sections. """ print("\nšŸš€ CRAWL4AI v0.5.0 TUTORIAL šŸš€") print("===============================") print("This tutorial demonstrates the key features of Crawl4AI v0.5.0") print("Including deep crawling, memory-adaptive dispatching, advanced filtering,") print("and more powerful extraction capabilities.") # Sections to run sections = [ deep_crawl, # 1. Deep Crawling with Best-First Strategy memory_adaptive_dispatcher, # 2. Memory-Adaptive Dispatcher http_crawler_strategy, # 3. HTTP Crawler Strategy proxy_rotation, # 4. Proxy Rotation llm_content_filter, # 5. LLM Content Filter pdf_processing, # 6. PDF Processing llm_schema_generation, # 7. Schema Generation using LLM ] for section in sections: try: await section() except Exception as e: print(f"āš ļø Error in {section.__name__}: {e}") print("\nšŸŽ‰ TUTORIAL COMPLETE! šŸŽ‰") print("You've now explored the key features of Crawl4AI v0.5.0") print("For more information, visit https://docs.crawl4ai.com") # Run the tutorial if __name__ == "__main__": asyncio.run(run_tutorial())