import asyncio import httpx import json import os import time from typing import List, Dict, Any, AsyncGenerator, Optional import textwrap # ← new: for pretty code literals import urllib.parse # ← needed for URL-safe /llm calls from dotenv import load_dotenv from rich.console import Console from rich.syntax import Syntax from rich.panel import Panel from rich.table import Table # --- Setup & Configuration --- load_dotenv() # Load environment variables from .env file console = Console() # --- Configuration --- BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Target URLs SIMPLE_URL = "https://example.com" # For demo purposes SIMPLE_URL = "https://httpbin.org/html" LINKS_URL = "https://httpbin.org/links/10/0" FORMS_URL = "https://httpbin.org/forms/post" # For JS demo BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction PYTHON_URL = "https://python.org" # For deeper crawl # Use the same sample site as deep crawl tests for consistency DEEP_CRAWL_BASE_URL = os.getenv( "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/") DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" # --- Helper Functions --- async def check_server_health(client: httpx.AsyncClient): """Check if the server is healthy before running tests.""" console.print("[bold cyan]Checking server health...[/]", end="") try: response = await client.get("/health", timeout=10.0) response.raise_for_status() health_data = response.json() console.print( f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]") return True except (httpx.RequestError, httpx.HTTPStatusError) as e: console.print(f"\n[bold red]Server health check FAILED:[/]") console.print(f"Error: {e}") console.print(f"Is the server running at {BASE_URL}?") return False except Exception as e: console.print( f"\n[bold red]An unexpected error occurred during health check:[/]") console.print(e) return False def print_payload(payload: Dict[str, Any]): """Prints the JSON payload nicely with a dark theme.""" syntax = Syntax( json.dumps(payload, indent=2), "json", theme="monokai", # <--- Changed theme here line_numbers=False, word_wrap=True # Added word wrap for potentially long payloads ) console.print(Panel(syntax, title="Request Payload", border_style="blue", expand=False)) def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results Summary", max_items: int = 3): """Prints a concise summary of crawl results.""" if not results: console.print(f"[yellow]{title}: No results received.[/]") return console.print(Panel(f"[bold]{title}[/]", border_style="green", expand=False)) count = 0 for result in results: if count >= max_items: console.print( f"... (showing first {max_items} of {len(results)} results)") break count += 1 success_icon = "[green]✔[/]" if result.get('success') else "[red]✘[/]" url = result.get('url', 'N/A') status = result.get('status_code', 'N/A') content_info = "" if result.get('extracted_content'): content_str = json.dumps(result['extracted_content']) snippet = ( content_str[:70] + '...') if len(content_str) > 70 else content_str content_info = f" | Extracted: [cyan]{snippet}[/]" elif result.get('markdown'): content_info = f" | Markdown: [cyan]Present[/]" elif result.get('html'): content_info = f" | HTML Size: [cyan]{len(result['html'])}[/]" console.print( f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}") if "metadata" in result and "depth" in result["metadata"]: console.print(f" Depth: {result['metadata']['depth']}") if not result.get('success') and result.get('error_message'): console.print(f" [red]Error: {result['error_message']}[/]") async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str) -> Optional[List[Dict[str, Any]]]: """Handles non-streaming POST requests.""" console.rule(f"[bold blue]{title}[/]", style="blue") print_payload(payload) console.print(f"Sending POST request to {client.base_url}{endpoint}...") try: start_time = time.time() response = await client.post(endpoint, json=payload) duration = time.time() - start_time console.print( f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)") response.raise_for_status() data = response.json() if data.get("success"): results = data.get("results", []) print_result_summary(results, title=f"{title} Results") return results else: console.print("[bold red]Request reported failure:[/]") console.print(data) return None except httpx.HTTPStatusError as e: console.print(f"[bold red]HTTP Error:[/]") console.print(f"Status: {e.response.status_code}") try: console.print(Panel(Syntax(json.dumps( e.response.json(), indent=2), "json", theme="default"), title="Error Response")) except json.JSONDecodeError: console.print(f"Response Body: {e.response.text}") except httpx.RequestError as e: console.print(f"[bold red]Request Error: {e}[/]") except Exception as e: console.print(f"[bold red]Unexpected Error: {e}[/]") return None async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str): """Handles streaming POST requests.""" console.rule(f"[bold magenta]{title}[/]", style="magenta") print_payload(payload) console.print( f"Sending POST stream request to {client.base_url}{endpoint}...") all_results = [] initial_status_code = None # Store initial status code try: start_time = time.time() async with client.stream("POST", endpoint, json=payload) as response: initial_status_code = response.status_code # Capture initial status duration = time.time() - start_time # Time to first byte potentially console.print( f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)") response.raise_for_status() # Raise exception for bad *initial* status codes console.print("[magenta]--- Streaming Results ---[/]") completed = False async for line in response.aiter_lines(): if line: try: data = json.loads(line) if data.get("status") == "completed": completed = True console.print( "[bold green]--- Stream Completed ---[/]") break elif data.get("url"): # Looks like a result dictionary all_results.append(data) # Display summary info as it arrives success_icon = "[green]✔[/]" if data.get( 'success') else "[red]✘[/]" url = data.get('url', 'N/A') # Display status code FROM THE RESULT DATA if available result_status = data.get('status_code', 'N/A') console.print( f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})") if not data.get('success') and data.get('error_message'): console.print( f" [red]Error: {data['error_message']}[/]") else: console.print( f" [yellow]Stream meta-data:[/yellow] {data}") except json.JSONDecodeError: console.print( f" [red]Stream decode error for line:[/red] {line}") if not completed: console.print( "[bold yellow]Warning: Stream ended without 'completed' marker.[/]") except httpx.HTTPStatusError as e: # Use the captured initial status code if available, otherwise from the exception status = initial_status_code if initial_status_code is not None else e.response.status_code console.print(f"[bold red]HTTP Error (Initial Request):[/]") console.print(f"Status: {status}") try: console.print(Panel(Syntax(json.dumps( e.response.json(), indent=2), "json", theme="default"), title="Error Response")) except json.JSONDecodeError: console.print(f"Response Body: {e.response.text}") except httpx.RequestError as e: console.print(f"[bold red]Request Error: {e}[/]") except Exception as e: console.print(f"[bold red]Unexpected Error during streaming: {e}[/]") # Print stack trace for unexpected errors console.print_exception(show_locals=False) # Call print_result_summary with the *collected* results AFTER the stream is done print_result_summary(all_results, title=f"{title} Collected Results") def load_proxies_from_env() -> List[Dict]: """ Load proxies from the PROXIES environment variable. Expected format: IP:PORT:USER:PASS,IP:PORT,IP2:PORT2:USER2:PASS2,... Returns a list of dictionaries suitable for the 'params' of ProxyConfig. """ proxies_params_list = [] proxies_str = os.getenv("PROXIES", "") if not proxies_str: # console.print("[yellow]PROXIES environment variable not set or empty.[/]") return proxies_params_list # Return empty list if not set try: proxy_entries = proxies_str.split(",") for entry in proxy_entries: entry = entry.strip() if not entry: continue parts = entry.split(":") proxy_dict = {} if len(parts) == 4: # Format: IP:PORT:USER:PASS ip, port, username, password = parts proxy_dict = { "server": f"http://{ip}:{port}", # Assuming http protocol "username": username, "password": password, # "ip": ip # 'ip' is not a standard ProxyConfig param, 'server' contains it } elif len(parts) == 2: # Format: IP:PORT ip, port = parts proxy_dict = { "server": f"http://{ip}:{port}", # "ip": ip } else: console.print( f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}") continue proxies_params_list.append(proxy_dict) except Exception as e: console.print( f"[red]Error loading proxies from environment:[/red] {e}") if proxies_params_list: console.print( f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]") # else: # console.print("[yellow]No valid proxies loaded from environment.[/]") return proxies_params_list # --- Demo Functions --- # 1. Basic Crawling async def demo_basic_single_url(client: httpx.AsyncClient): payload = { "urls": [SIMPLE_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": "BYPASS" } } } result = await make_request(client, "/crawl", payload, "Demo 1a: Basic Single URL Crawl") return result async def demo_basic_multi_url(client: httpx.AsyncClient): payload = { "urls": [SIMPLE_URL, LINKS_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}} } result = await make_request(client, "/crawl", payload, "Demo 1b: Basic Multi URL Crawl") return result async def demo_streaming_multi_url(client: httpx.AsyncClient): payload = { # "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL, SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL "urls": [ "https://example.com/page1", "https://example.com/page2", "https://example.com/page3", "https://example.com/page4", "https://example.com/page5" ], # Add another URL "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "stream": True, } } } result = await stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl") return result # 2. Markdown Generation & Content Filtering async def demo_markdown_default(client: httpx.AsyncClient): payload = { "urls": [SIMPLE_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "markdown_generator": { "type": "DefaultMarkdownGenerator", "params": { "content_source": "fit_html", "options": { "type": "dict", "value": { "ignore_links": True } } } } # Explicitly default } } } result = await make_request(client, "/crawl", payload, "Demo 2a: Default Markdown Generation") return result async def demo_markdown_pruning(client: httpx.AsyncClient): payload = { "urls": [PYTHON_URL], # Use a more complex page "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": "BYPASS", "markdown_generator": { "type": "DefaultMarkdownGenerator", "params": { "content_filter": { "type": "PruningContentFilter", "params": { "threshold": 0.6, "threshold_type": "relative" } } } } } } } result = await make_request(client, "/crawl", payload, "Demo 2b: Markdown with Pruning Filter") return result async def demo_markdown_bm25(client: httpx.AsyncClient): payload = { "urls": [PYTHON_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": "BYPASS", "markdown_generator": { "type": "DefaultMarkdownGenerator", "params": { "content_filter": { "type": "BM25ContentFilter", "params": { "user_query": "Python documentation language reference" } } } } } } } result = await make_request(client, "/crawl", payload, "Demo 2c: Markdown with BM25 Filter") return result # 3. Specific Parameters # Corrected Demo Function: demo_param_css_selector async def demo_param_css_selector(client: httpx.AsyncClient): css_selector = ".main-content" # Using the suggested correct selector payload = { "urls": [PYTHON_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "css_selector": css_selector # Target specific div # No extraction strategy is needed to demo this parameter's effect on input HTML } } } results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{css_selector}')") if results: result = results[0] if result['success'] and result.get('html'): # Check if the returned HTML is likely constrained # A simple check: does it contain expected content from within the selector, # and does it LACK content known to be outside (like footer links)? html_content = result['html'] # Text likely within .main-content somewhere content_present = 'Python Software Foundation' in html_content # Text likely in the footer, outside .main-content footer_absent = 'Legal Statements' not in html_content console.print( f" Content Check: Text inside '{css_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}") console.print( f" Content Check: Text outside '{css_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}") if not content_present or not footer_absent: console.print( f" [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}") else: console.print( f" [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}") elif result['success']: console.print( "[yellow]HTML content was empty in the successful result.[/]") # Error message is handled by print_result_summary called by make_request async def demo_param_js_execution(client: httpx.AsyncClient): payload = { "urls": ["https://example.com"], # Use a page with a form "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": "BYPASS", # Simple JS to fill and maybe click (won't submit without more complex setup) "js_code": """ (() => { document.querySelector('h1').innerText = 'Crawl4AI Demo'; return { filled_name: document.querySelector('h1').innerText }; })(); """, "delay_before_return_html": 0.5 # Give JS time to potentially run } } } results = await make_request(client, "/crawl", payload, "Demo 3b: Using js_code Parameter") if results and results[0].get("js_execution_result"): console.print("[cyan]JS Execution Result:[/]", results[0]["js_execution_result"]) elif results: console.print("[yellow]JS Execution Result not found in response.[/]") async def demo_param_screenshot(client: httpx.AsyncClient): payload = { "urls": [SIMPLE_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS", "screenshot": True} } } results = await make_request(client, "/crawl", payload, "Demo 3c: Taking a Screenshot") if results and results[0].get("screenshot"): console.print( f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}") elif results: console.print("[yellow]Screenshot data not found in response.[/]") async def demo_param_ssl_fetch(client: httpx.AsyncClient): payload = { "urls": [PYTHON_URL], # Needs HTTPS "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS", "fetch_ssl_certificate": True} } } results = await make_request(client, "/crawl", payload, "Demo 3d: Fetching SSL Certificate") if results and results[0].get("ssl_certificate"): console.print("[cyan]SSL Certificate Info:[/]") console.print(results[0]["ssl_certificate"]) elif results: console.print("[yellow]SSL Certificate data not found in response.[/]") async def demo_param_proxy(client: httpx.AsyncClient): proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts if not proxy_params_list: console.rule( "[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow") console.print("Set the PROXIES environment variable to run this demo.") console.print("Format: IP:PORT:USR:PWD,IP:PORT,...") return payload = { "urls": ["https://httpbin.org/ip"], # URL that shows originating IP "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": "BYPASS", "proxy_rotation_strategy": { "type": "RoundRobinProxyStrategy", "params": { "proxies": [ # [ # { # "type": "ProxyConfig", # "params": { # server:"...", # "username": "...", # "password": "..." # } # }, # ... # ] # Filter out the 'ip' key when sending to server, as it's not part of ProxyConfig {"type": "ProxyConfig", "params": { k: v for k, v in p.items() if k != 'ip'}} for p in proxy_params_list ] } } } } } results = await make_request(client, "/crawl", payload, "Demo 3e: Using Proxies") # --- Verification Logic --- if results and results[0].get("success"): result = results[0] try: # httpbin.org/ip returns JSON within the HTML body's
tag html_content = result.get('html', '') # Basic extraction - find JSON withintags or just the JSON itself json_str = None if '500 else md console.print(Panel(snippet, title="Markdown snippet", border_style="cyan", expand=False)) except Exception as e: console.print(f"[bold red]Error hitting /md:[/] {e}") # 8. LLM QA helper endpoint async def demo_llm_endpoint(client: httpx.AsyncClient): """ Quick QA round-trip with /llm. Asks a trivial question against SIMPLE_URL just to show wiring. """ page_url = SIMPLE_URL question = "What is the title of this page?" console.rule("[bold magenta]Demo 7b: /llm Endpoint[/]", style="magenta") enc = urllib.parse.quote_plus(page_url, safe="") console.print(f"GET /llm/{enc}?q={question}") try: t0 = time.time() resp = await client.get(f"/llm/{enc}", params={"q": question}) dt = time.time() - t0 console.print( f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") resp.raise_for_status() answer = resp.json().get("answer", "") console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False)) except Exception as e: console.print(f"[bold red]Error hitting /llm:[/] {e}") # 9. /config/dump helpers -------------------------------------------------- async def demo_config_dump_valid(client: httpx.AsyncClient): """ Send a single top-level CrawlerRunConfig(...) expression and show the dump. """ code_snippet = "CrawlerRunConfig(cache_mode='BYPASS', screenshot=True)" payload = {"code": code_snippet} console.rule("[bold blue]Demo 8a: /config/dump (valid)[/]", style="blue") print_payload(payload) try: t0 = time.time() resp = await client.post("/config/dump", json=payload) dt = time.time() - t0 console.print( f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") resp.raise_for_status() dump_json = resp.json() console.print(Panel(Syntax(json.dumps(dump_json, indent=2), "json", theme="monokai"), title="Dump()", border_style="cyan")) except Exception as e: console.print(f"[bold red]Error in valid /config/dump call:[/] {e}") async def demo_config_dump_invalid(client: httpx.AsyncClient): """ Purposely break the rule (nested call) to show the 400 parse error. """ bad_code = textwrap.dedent(""" BrowserConfig(headless=True); CrawlerRunConfig() """).strip() payload = {"code": bad_code} console.rule( "[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta") print_payload(payload) try: resp = await client.post("/config/dump", json=payload) console.print( f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]") resp.raise_for_status() # should throw -> except except httpx.HTTPStatusError as e: console.print("[cyan]Expected parse/validation failure captured:[/]") try: console.print(Panel(Syntax(json.dumps( e.response.json(), indent=2), "json", theme="fruity"), title="Error payload")) except Exception: console.print(e.response.text) except Exception as e: console.print( f"[bold red]Unexpected error during invalid test:[/] {e}") # --- Update Main Runner to include new demo --- async def main_demo(): async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client: if not await check_server_health(client): return # --- Run Demos --- # await demo_basic_single_url(client) # await demo_basic_multi_url(client) # await demo_streaming_multi_url(client) # await demo_markdown_default(client) # await demo_markdown_pruning(client) # await demo_markdown_bm25(client) # await demo_param_css_selector(client) # await demo_param_js_execution(client) # await demo_param_screenshot(client) # await demo_param_ssl_fetch(client) # await demo_param_proxy(client) # Skips if no PROXIES env var # await demo_extract_css(client) # await demo_extract_llm(client) # Skips if no common LLM key env var # await demo_deep_basic(client) # await demo_deep_streaming(client) # This need extra work # await demo_deep_with_css_extraction(client) # # Skips if no common LLM key env var # await demo_deep_with_llm_extraction(client) # await demo_deep_with_proxy(client) # Skips if no PROXIES env var # await demo_deep_with_ssl(client) # Added the new demo # --- Helper endpoints --- await demo_markdown_endpoint(client) await demo_llm_endpoint(client) # --- /config/dump sanity checks --- await demo_config_dump_valid(client) await demo_config_dump_invalid(client) console.rule("[bold green]Demo Complete[/]", style="green") if __name__ == "__main__": try: asyncio.run(main_demo()) except KeyboardInterrupt: console.print("\n[yellow]Demo interrupted by user.[/]") except Exception as e: console.print( f"\n[bold red]An error occurred during demo execution:[/]") console.print_exception(show_locals=False)