import asyncio
import time
import re
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig, MemoryAdaptiveDispatcher, HTTPCrawlerConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import (
    BestFirstCrawlingStrategy,
    FilterChain,
    URLPatternFilter,
    DomainFilter,
    ContentTypeFilter,
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from pprint import pprint


# 1️⃣ Deep Crawling with Best-First Strategy
async def deep_crawl():
    """
    PART 1: Deep Crawling with Best-First Strategy

    This function demonstrates:
    - Using the BestFirstCrawlingStrategy
    - Creating filter chains to narrow down crawl targets
    - Using a scorer to prioritize certain URLs
    - Respecting robots.txt rules
    """
    print("\n===== DEEP CRAWLING =====")
    print("This example shows how to implement deep crawling with filters, scorers, and robots.txt compliance.")

    # Create a filter chain to filter URLs based on patterns, domains and content type
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Create a keyword scorer that prioritizes pages with certain keywords first
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )

    # Set up the configuration with robots.txt compliance enabled
    deep_crawl_config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
        check_robots_txt=True,  # Enable robots.txt compliance
    )

    # Execute the crawl
    async with AsyncWebCrawler() as crawler:
        print("\n🚀 Starting deep crawl with Best-First strategy...")
        print(" - Filtering by domain, URL patterns, and content type")
        print(" - Scoring pages based on keyword relevance")
        print(" - Respecting robots.txt rules")

        start_time = time.perf_counter()
        results = []
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=deep_crawl_config):
            # Print each result as it comes in
            depth = result.metadata.get("depth", 0)
            score = result.metadata.get("score", 0)
            print(f"Crawled: {result.url} (Depth: {depth}), score: {score:.2f}")
            results.append(result)

        duration = time.perf_counter() - start_time

        # Print summary statistics
        print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")

        # Group results by depth
        if results:
            depth_counts = {}
            for result in results:
                depth = result.metadata.get("depth", 0)
                depth_counts[depth] = depth_counts.get(depth, 0) + 1

            print("\n📊 Pages crawled by depth:")
            for depth, count in sorted(depth_counts.items()):
                print(f"  Depth {depth}: {count} pages")
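

# A minimal breadth-first variant of the crawl above, shown as a sketch. It assumes
# BFSDeepCrawlStrategy (from crawl4ai.deep_crawling) accepts the same max_depth /
# include_external / filter_chain arguments as BestFirstCrawlingStrategy; adjust to
# the installed version if the signature differs.
async def deep_crawl_bfs_sketch():
    from crawl4ai.deep_crawling import BFSDeepCrawlStrategy  # assumed import path

    filter_chain = FilterChain([DomainFilter(allowed_domains=["docs.crawl4ai.com"])])
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=1,  # shallow crawl: BFS visits every depth-1 link before going deeper
            include_external=False,
            filter_chain=filter_chain,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
    )
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=config):
            print(f"[BFS] {result.url} (depth {result.metadata.get('depth', 0)})")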
""" print("\n===== MEMORY-ADAPTIVE DISPATCHER =====") print("This example shows how to use the memory-adaptive dispatcher for resource management.") # Configure the dispatcher (optional, defaults are used if not provided) dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=80.0, # Pause if memory usage exceeds 80% check_interval=0.5, # Check memory every 0.5 seconds ) # Test URLs urls = [ "https://docs.crawl4ai.com", "https://github.com/unclecode/crawl4ai" ] async def batch_mode(): print("\nš BATCH MODE:") print(" In this mode, all results are collected before being returned.") async with AsyncWebCrawler() as crawler: start_time = time.perf_counter() results = await crawler.arun_many( urls=urls, config=CrawlerRunConfig(stream=False), # Batch mode dispatcher=dispatcher, ) print(f" ā Received all {len(results)} results after {time.perf_counter() - start_time:.2f} seconds") for result in results: print(f" ā {result.url} with status code: {result.status_code}") async def stream_mode(): print("\nš STREAMING MODE:") print(" In this mode, results are processed as they become available.") async with AsyncWebCrawler() as crawler: start_time = time.perf_counter() count = 0 first_result_time = None async for result in await crawler.arun_many( urls=urls, config=CrawlerRunConfig(stream=True), # Stream mode dispatcher=dispatcher, ): count += 1 current_time = time.perf_counter() - start_time if count == 1: first_result_time = current_time print(f" ā First result after {first_result_time:.2f} seconds: {result.url}") else: print(f" ā Result #{count} after {current_time:.2f} seconds: {result.url}") print(f" ā Total: {count} results") print(f" ā First result: {first_result_time:.2f} seconds") print(f" ā All results: {time.perf_counter() - start_time:.2f} seconds") # Run both examples await batch_mode() await stream_mode() print("\nš Key Takeaway: The memory-adaptive dispatcher prevents OOM errors") print(" and manages concurrency based on system resources.") # 3ļøā£ HTTP Crawler Strategy async def http_crawler_strategy(): """ PART 3: HTTP Crawler Strategy This function demonstrates: - Using the lightweight HTTP-only crawler - Setting custom headers and configurations """ print("\n===== HTTP CRAWLER STRATEGY =====") print("This example shows how to use the fast, lightweight HTTP-only crawler.") # Use the HTTP crawler strategy http_config = HTTPCrawlerConfig( method="GET", headers={"User-Agent": "MyCustomBot/1.0"}, follow_redirects=True, verify_ssl=True ) print("\nš Initializing HTTP crawler strategy...") print(" - Using custom User-Agent: MyCustomBot/1.0") print(" - Following redirects: Enabled") print(" - Verifying SSL: Enabled") # Create crawler with HTTP strategy async with AsyncWebCrawler( crawler_strategy=AsyncHTTPCrawlerStrategy(browser_config=http_config) ) as crawler: start_time = time.perf_counter() result = await crawler.arun("https://example.com") duration = time.perf_counter() - start_time print(f"\nā Crawled in {duration:.2f} seconds") print(f"ā Status code: {result.status_code}") print(f"ā Content length: {len(result.html)} bytes") # Check if there was a redirect if result.redirected_url and result.redirected_url != result.url: print(f"ā¹ļø Redirected from {result.url} to {result.redirected_url}") print("\nš Key Takeaway: HTTP crawler is faster and more memory-efficient") print(" than browser-based crawling for simple pages.") # 4ļøā£ Proxy Rotation async def proxy_rotation(): """ PART 4: Proxy Rotation This function demonstrates: - Setting up a proxy rotation strategy - Using 


# 4️⃣ Proxy Rotation
async def proxy_rotation():
    """
    PART 4: Proxy Rotation

    This function demonstrates:
    - Setting up a proxy rotation strategy
    - Using multiple proxies in a round-robin fashion
    """
    print("\n===== PROXY ROTATION =====")
    print("This example shows how to implement proxy rotation for distributed crawling.")

    # Load proxies and create rotation strategy
    # e.g. export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
    proxies = ProxyConfig.from_env()
    if not proxies:
        print("No proxies found in environment. Set the PROXIES env variable!")
        return

    proxy_strategy = RoundRobinProxyStrategy(proxies)

    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=proxy_strategy,
    )

    urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

    print("\n🚀 Initializing crawler with proxy rotation...")
    async with AsyncWebCrawler(config=browser_config) as crawler:
        print("\n🚀 Starting batch crawl with proxy rotation...")
        results = await crawler.arun_many(urls=urls, config=run_config)
        for result in results:
            if result.success:
                ip_match = re.search(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}", result.html)
                current_proxy = run_config.proxy_config if run_config.proxy_config else None
                if current_proxy and ip_match:
                    print(f"URL {result.url}")
                    print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
                    verified = ip_match.group(0) == current_proxy.ip
                    if verified:
                        print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
                    else:
                        print("❌ Proxy failed or IP mismatch!")
                    print("---")
            else:
                print(f"❌ Crawl via proxy failed!: {result.error_message}")
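

# An alternative way to build the rotation list for Part 4, sketched here instead of
# reading the PROXIES environment variable. It assumes ProxyConfig accepts
# server/username/password keyword arguments; the addresses are placeholders, so
# substitute your own proxy pool before use.
def build_static_proxy_strategy_sketch():
    proxies = [
        ProxyConfig(server="http://203.0.113.10:8080", username="user1", password="pass1"),  # placeholder
        ProxyConfig(server="http://203.0.113.11:8080", username="user2", password="pass2"),  # placeholder
    ]
    # Pass the returned strategy to CrawlerRunConfig(proxy_rotation_strategy=...)
    # exactly as proxy_rotation() does above.
    return RoundRobinProxyStrategy(proxies)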


# 5️⃣ LLM Content Filter (requires API key)
async def llm_content_filter():
    """
    PART 5: LLM Content Filter

    This function demonstrates:
    - Configuring LLM providers via LLMConfig
    - Using an LLM to generate focused markdown

    Note: Requires a valid API key for the chosen LLM provider
    """
    print("\n===== LLM CONTENT FILTER =====")
    print("This example shows how to use an LLM to generate focused markdown content.")
    print("Note: This example requires an API key. Set it in environment variables.")

    # Create LLM configuration
    # Replace with your actual API key or set it as an environment variable
    llm_config = LLMConfig(
        provider="gemini/gemini-1.5-pro",
        api_token="env:GEMINI_API_KEY",  # Will read from the GEMINI_API_KEY environment variable
    )

    print("\n🚀 Setting up LLM content filter...")
    print(f" - Provider: {llm_config.provider}")
    print(" - API token: Using environment variable")
    print(" - Instruction: Extract key concepts and summaries")

    # Create markdown generator with LLM filter
    markdown_generator = DefaultMarkdownGenerator(
        content_filter=LLMContentFilter(
            llm_config=llm_config,
            instruction="Extract key concepts and summaries",
        )
    )

    config = CrawlerRunConfig(markdown_generator=markdown_generator)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://docs.crawl4ai.com", config=config)
        print("\n✅ Generated focused markdown:")
        pprint(result.markdown.fit_markdown)


# 6️⃣ PDF Processing
async def pdf_processing():
    """
    PART 6: PDF Processing

    This function demonstrates:
    - Using PDFCrawlerStrategy and PDFContentScrapingStrategy
    - Extracting text and metadata from PDFs
    """
    print("\n===== PDF PROCESSING =====")
    print("This example shows how to extract text and metadata from PDF files.")

    # Sample PDF URL
    pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"

    print("\n🚀 Initializing PDF crawler...")
    print(f" - Target PDF: {pdf_url}")
    print(" - Using PDFCrawlerStrategy and PDFContentScrapingStrategy")

    # Create crawler with PDF strategy
    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        print("\n🚀 Starting PDF processing...")
        start_time = time.perf_counter()
        result = await crawler.arun(
            pdf_url,
            config=CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy()),
        )
        duration = time.perf_counter() - start_time

        print(f"\n✅ Processed PDF in {duration:.2f} seconds")

        # Show metadata
        print("\n📊 PDF Metadata:")
        if result.metadata:
            for key, value in result.metadata.items():
                if key not in ["html", "text", "markdown"] and value:
                    print(f" - {key}: {value}")
        else:
            print(" No metadata available")

        # Show a sample of the content
        if result.markdown:
            print("\n📄 PDF Content Sample:")
            content_sample = result.markdown[:500] + "..." if len(result.markdown) > 500 else result.markdown
            print(f"---\n{content_sample}\n---")
        else:
            print("\n⚠️ No content extracted")

    print("\n💡 Key Takeaway: Crawl4AI can now process PDF files")
    print("   to extract both text content and metadata.")


# 7️⃣ LLM Schema Generation (requires API key)
async def llm_schema_generation():
    """
    PART 7: LLM Schema Generation

    This function demonstrates:
    - Configuring LLM providers via LLMConfig
    - Using an LLM to generate extraction schemas
    - Extracting data with JsonCssExtractionStrategy

    Note: Requires a valid API key for the chosen LLM provider
    """
    print("\n===== LLM SCHEMA GENERATION =====")
    print("This example shows how to use an LLM to automatically generate extraction schemas.")
    print("Note: This example requires an API key. Set it in environment variables.")

    # Sample HTML
    sample_html = """