# crawl4ai/docs/examples/docker/demo_docker_api.py
import asyncio
import httpx
import json
import os
import time
from typing import List, Dict, Any, AsyncGenerator, Optional
import textwrap  # for pretty code literals
import urllib.parse  # for URL-safe /llm calls
from dotenv import load_dotenv
from rich.console import Console
from rich.syntax import Syntax
from rich.panel import Panel
from rich.table import Table
# --- Setup & Configuration ---
load_dotenv() # Load environment variables from .env file
console = Console()
# --- Configuration ---
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")
# Target URLs
SIMPLE_URL = "https://httpbin.org/html"  # Simple page used across the demos
LINKS_URL = "https://httpbin.org/links/10/0"
FORMS_URL = "https://httpbin.org/forms/post" # For JS demo
BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction
PYTHON_URL = "https://python.org" # For deeper crawl
# Use the same sample site as deep crawl tests for consistency
DEEP_CRAWL_BASE_URL = os.getenv(
"DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com"
# --- Helper Functions ---
async def check_server_health(client: httpx.AsyncClient):
"""Check if the server is healthy before running tests."""
console.print("[bold cyan]Checking server health...[/]", end="")
try:
response = await client.get("/health", timeout=10.0)
response.raise_for_status()
health_data = response.json()
console.print(
f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]")
return True
except (httpx.RequestError, httpx.HTTPStatusError) as e:
console.print(f"\n[bold red]Server health check FAILED:[/]")
console.print(f"Error: {e}")
console.print(f"Is the server running at {BASE_URL}?")
return False
except Exception as e:
console.print(
f"\n[bold red]An unexpected error occurred during health check:[/]")
console.print(e)
return False
def print_payload(payload: Dict[str, Any]):
"""Prints the JSON payload nicely with a dark theme."""
syntax = Syntax(
json.dumps(payload, indent=2),
"json",
theme="monokai", # <--- Changed theme here
line_numbers=False,
word_wrap=True # Added word wrap for potentially long payloads
)
console.print(Panel(syntax, title="Request Payload",
border_style="blue", expand=False))
def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results Summary", max_items: int = 3):
"""Prints a concise summary of crawl results."""
if not results:
console.print(f"[yellow]{title}: No results received.[/]")
return
console.print(Panel(f"[bold]{title}[/]",
border_style="green", expand=False))
count = 0
for result in results:
if count >= max_items:
console.print(
f"... (showing first {max_items} of {len(results)} results)")
break
count += 1
success_icon = "[green]✔[/]" if result.get('success') else "[red]✘[/]"
url = result.get('url', 'N/A')
status = result.get('status_code', 'N/A')
content_info = ""
if result.get('extracted_content'):
content_str = json.dumps(result['extracted_content'])
snippet = (
content_str[:70] + '...') if len(content_str) > 70 else content_str
content_info = f" | Extracted: [cyan]{snippet}[/]"
elif result.get('markdown'):
content_info = f" | Markdown: [cyan]Present[/]"
elif result.get('html'):
content_info = f" | HTML Size: [cyan]{len(result['html'])}[/]"
console.print(
f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}")
if "metadata" in result and "depth" in result["metadata"]:
console.print(f" Depth: {result['metadata']['depth']}")
if not result.get('success') and result.get('error_message'):
console.print(f" [red]Error: {result['error_message']}[/]")
async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str) -> Optional[List[Dict[str, Any]]]:
"""Handles non-streaming POST requests."""
console.rule(f"[bold blue]{title}[/]", style="blue")
print_payload(payload)
console.print(f"Sending POST request to {client.base_url}{endpoint}...")
try:
start_time = time.time()
response = await client.post(endpoint, json=payload)
duration = time.time() - start_time
console.print(
f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)")
response.raise_for_status()
data = response.json()
if data.get("success"):
results = data.get("results", [])
print_result_summary(results, title=f"{title} Results")
return results
else:
console.print("[bold red]Request reported failure:[/]")
console.print(data)
return None
except httpx.HTTPStatusError as e:
console.print(f"[bold red]HTTP Error:[/]")
console.print(f"Status: {e.response.status_code}")
try:
console.print(Panel(Syntax(json.dumps(
e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
except json.JSONDecodeError:
console.print(f"Response Body: {e.response.text}")
except httpx.RequestError as e:
console.print(f"[bold red]Request Error: {e}[/]")
except Exception as e:
console.print(f"[bold red]Unexpected Error: {e}[/]")
return None
async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str):
"""Handles streaming POST requests."""
console.rule(f"[bold magenta]{title}[/]", style="magenta")
print_payload(payload)
console.print(
f"Sending POST stream request to {client.base_url}{endpoint}...")
all_results = []
initial_status_code = None # Store initial status code
try:
start_time = time.time()
async with client.stream("POST", endpoint, json=payload) as response:
initial_status_code = response.status_code # Capture initial status
duration = time.time() - start_time # Time to first byte potentially
console.print(
f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)")
response.raise_for_status() # Raise exception for bad *initial* status codes
console.print("[magenta]--- Streaming Results ---[/]")
completed = False
async for line in response.aiter_lines():
if line:
try:
data = json.loads(line)
if data.get("status") == "completed":
completed = True
console.print(
"[bold green]--- Stream Completed ---[/]")
break
elif data.get("url"): # Looks like a result dictionary
all_results.append(data)
# Display summary info as it arrives
success_icon = "[green]✔[/]" if data.get(
'success') else "[red]✘[/]"
url = data.get('url', 'N/A')
# Display status code FROM THE RESULT DATA if available
result_status = data.get('status_code', 'N/A')
console.print(
f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})")
if not data.get('success') and data.get('error_message'):
console.print(
f" [red]Error: {data['error_message']}[/]")
else:
console.print(
f" [yellow]Stream meta-data:[/yellow] {data}")
except json.JSONDecodeError:
console.print(
f" [red]Stream decode error for line:[/red] {line}")
if not completed:
console.print(
"[bold yellow]Warning: Stream ended without 'completed' marker.[/]")
except httpx.HTTPStatusError as e:
# Use the captured initial status code if available, otherwise from the exception
status = initial_status_code if initial_status_code is not None else e.response.status_code
console.print(f"[bold red]HTTP Error (Initial Request):[/]")
console.print(f"Status: {status}")
try:
console.print(Panel(Syntax(json.dumps(
e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
except json.JSONDecodeError:
console.print(f"Response Body: {e.response.text}")
except httpx.RequestError as e:
console.print(f"[bold red]Request Error: {e}[/]")
except Exception as e:
console.print(f"[bold red]Unexpected Error during streaming: {e}[/]")
# Print stack trace for unexpected errors
console.print_exception(show_locals=False)
# Call print_result_summary with the *collected* results AFTER the stream is done
print_result_summary(all_results, title=f"{title} Collected Results")
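# The /crawl/stream endpoint is consumed above as NDJSON: one JSON object per line, each
# either a per-URL result dict (it has a "url" key) or a terminator such as
# {"status": "completed"}. A minimal standalone sketch of that consumption loop
# (illustrative only; this helper is not called anywhere in this script and assumes the
# same payload shape as the demos below):
async def stream_crawl_minimal(client: httpx.AsyncClient, payload: Dict[str, Any]) -> List[Dict[str, Any]]:
    collected: List[Dict[str, Any]] = []
    async with client.stream("POST", "/crawl/stream", json=payload) as response:
        response.raise_for_status()  # fail fast on a bad initial status
        async for line in response.aiter_lines():
            if not line:
                continue
            data = json.loads(line)
            if data.get("status") == "completed":
                break  # server signals end of stream
            if data.get("url"):
                collected.append(data)  # a per-URL crawl result
    return collected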
def load_proxies_from_env() -> List[Dict]:
"""
Load proxies from the PROXIES environment variable.
Expected format: IP:PORT:USER:PASS,IP:PORT,IP2:PORT2:USER2:PASS2,...
Returns a list of dictionaries suitable for the 'params' of ProxyConfig.
"""
proxies_params_list = []
proxies_str = os.getenv("PROXIES", "")
if not proxies_str:
# console.print("[yellow]PROXIES environment variable not set or empty.[/]")
return proxies_params_list # Return empty list if not set
try:
proxy_entries = proxies_str.split(",")
for entry in proxy_entries:
entry = entry.strip()
if not entry:
continue
parts = entry.split(":")
proxy_dict = {}
if len(parts) == 4: # Format: IP:PORT:USER:PASS
ip, port, username, password = parts
proxy_dict = {
"server": f"http://{ip}:{port}", # Assuming http protocol
"username": username,
"password": password,
# "ip": ip # 'ip' is not a standard ProxyConfig param, 'server' contains it
}
elif len(parts) == 2: # Format: IP:PORT
ip, port = parts
proxy_dict = {
"server": f"http://{ip}:{port}",
# "ip": ip
}
else:
console.print(
f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}")
continue
proxies_params_list.append(proxy_dict)
except Exception as e:
console.print(
f"[red]Error loading proxies from environment:[/red] {e}")
if proxies_params_list:
console.print(
f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]")
# else:
# console.print("[yellow]No valid proxies loaded from environment.[/]")
return proxies_params_list
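# Example of the mapping performed above (hypothetical values): with
#   PROXIES="1.2.3.4:8080:user:pass,5.6.7.8:3128"
# load_proxies_from_env() returns
#   [{"server": "http://1.2.3.4:8080", "username": "user", "password": "pass"},
#    {"server": "http://5.6.7.8:3128"}]
# and the proxy demos below wrap each dict as a {"type": "ProxyConfig", "params": {...}} entry.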
# --- Demo Functions ---
# 1. Basic Crawling
async def demo_basic_single_url(client: httpx.AsyncClient):
payload = {
"urls": [SIMPLE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS"
}
}
}
result = await make_request(client, "/crawl", payload, "Demo 1a: Basic Single URL Crawl")
return result
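# For reference, the same request can be issued from a shell (a sketch, assuming the server
# listens at the default BASE_URL above and requires no extra auth headers):
#   curl -X POST "http://localhost:11235/crawl" \
#        -H "Content-Type: application/json" \
#        -d '{"urls": ["https://httpbin.org/html"],
#             "browser_config": {"type": "BrowserConfig", "params": {"headless": true}},
#             "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}}}'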
async def demo_basic_multi_url(client: httpx.AsyncClient):
payload = {
"urls": [SIMPLE_URL, LINKS_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}}
}
result = await make_request(client, "/crawl", payload, "Demo 1b: Basic Multi URL Crawl")
return result
async def demo_streaming_multi_url(client: httpx.AsyncClient):
payload = {
# "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL, SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL
"urls": [
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page3",
"https://example.com/page4",
"https://example.com/page5"
        ],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True,
}
}
}
result = await stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl")
return result
# 2. Markdown Generation & Content Filtering
async def demo_markdown_default(client: httpx.AsyncClient):
payload = {
"urls": [SIMPLE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_source": "fit_html",
"options": {
"type": "dict",
"value": {
"ignore_links": True
}
}
}
} # Explicitly default
}
}
}
result = await make_request(client, "/crawl", payload, "Demo 2a: Default Markdown Generation")
return result
async def demo_markdown_pruning(client: httpx.AsyncClient):
payload = {
"urls": [PYTHON_URL], # Use a more complex page
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.6,
"threshold_type": "relative"
}
}
}
}
}
}
}
result = await make_request(client, "/crawl", payload, "Demo 2b: Markdown with Pruning Filter")
return result
async def demo_markdown_bm25(client: httpx.AsyncClient):
payload = {
"urls": [PYTHON_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "BM25ContentFilter",
"params": {
"user_query": "Python documentation language reference"
}
}
}
}
}
}
}
result = await make_request(client, "/crawl", payload, "Demo 2c: Markdown with BM25 Filter")
return result
# 3. Specific Parameters
# Corrected Demo Function: demo_param_css_selector
async def demo_param_css_selector(client: httpx.AsyncClient):
css_selector = ".main-content" # Using the suggested correct selector
payload = {
"urls": [PYTHON_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"css_selector": css_selector # Target specific div
# No extraction strategy is needed to demo this parameter's effect on input HTML
}
}
}
results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{css_selector}')")
if results:
result = results[0]
if result['success'] and result.get('html'):
# Check if the returned HTML is likely constrained
# A simple check: does it contain expected content from within the selector,
# and does it LACK content known to be outside (like footer links)?
html_content = result['html']
# Text likely within .main-content somewhere
content_present = 'Python Software Foundation' in html_content
# Text likely in the footer, outside .main-content
footer_absent = 'Legal Statements' not in html_content
console.print(
f" Content Check: Text inside '{css_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}")
console.print(
f" Content Check: Text outside '{css_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}")
if not content_present or not footer_absent:
console.print(
f" [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}")
else:
console.print(
f" [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}")
elif result['success']:
console.print(
"[yellow]HTML content was empty in the successful result.[/]")
# Error message is handled by print_result_summary called by make_request
async def demo_param_js_execution(client: httpx.AsyncClient):
payload = {
"urls": ["https://example.com"], # Use a page with a form
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
                # Simple JS that rewrites the page's <h1> and returns its new text
"js_code": """
(() => {
document.querySelector('h1').innerText = 'Crawl4AI Demo';
return { filled_name: document.querySelector('h1').innerText };
})();
""",
"delay_before_return_html": 0.5 # Give JS time to potentially run
}
}
}
results = await make_request(client, "/crawl", payload, "Demo 3b: Using js_code Parameter")
if results and results[0].get("js_execution_result"):
console.print("[cyan]JS Execution Result:[/]",
results[0]["js_execution_result"])
elif results:
console.print("[yellow]JS Execution Result not found in response.[/]")
async def demo_param_screenshot(client: httpx.AsyncClient):
payload = {
"urls": [SIMPLE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS", "screenshot": True}
}
}
results = await make_request(client, "/crawl", payload, "Demo 3c: Taking a Screenshot")
if results and results[0].get("screenshot"):
console.print(
f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}")
elif results:
console.print("[yellow]Screenshot data not found in response.[/]")
async def demo_param_ssl_fetch(client: httpx.AsyncClient):
payload = {
"urls": [PYTHON_URL], # Needs HTTPS
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS", "fetch_ssl_certificate": True}
}
}
results = await make_request(client, "/crawl", payload, "Demo 3d: Fetching SSL Certificate")
if results and results[0].get("ssl_certificate"):
console.print("[cyan]SSL Certificate Info:[/]")
console.print(results[0]["ssl_certificate"])
elif results:
console.print("[yellow]SSL Certificate data not found in response.[/]")
async def demo_param_proxy(client: httpx.AsyncClient):
proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts
if not proxy_params_list:
console.rule(
"[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow")
console.print("Set the PROXIES environment variable to run this demo.")
console.print("Format: IP:PORT:USR:PWD,IP:PORT,...")
return
payload = {
"urls": ["https://httpbin.org/ip"], # URL that shows originating IP
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"proxy_rotation_strategy": {
"type": "RoundRobinProxyStrategy",
"params": {
"proxies": [
# [
# {
# "type": "ProxyConfig",
# "params": {
# server:"...",
# "username": "...",
# "password": "..."
# }
# },
# ...
# ]
# Filter out the 'ip' key when sending to server, as it's not part of ProxyConfig
{"type": "ProxyConfig", "params": {
k: v for k, v in p.items() if k != 'ip'}}
for p in proxy_params_list
]
}
}
}
}
}
results = await make_request(client, "/crawl", payload, "Demo 3e: Using Proxies")
# --- Verification Logic ---
if results and results[0].get("success"):
result = results[0]
try:
# httpbin.org/ip returns JSON within the HTML body's <pre> tag
html_content = result.get('html', '')
# Basic extraction - find JSON within <pre> tags or just the JSON itself
json_str = None
if '<pre' in html_content:
start = html_content.find('{')
end = html_content.rfind('}')
if start != -1 and end != -1:
json_str = html_content[start:end+1]
elif html_content.strip().startswith('{'): # Maybe it's just JSON
json_str = html_content.strip()
if json_str:
ip_data = json.loads(json_str)
origin_ip = ip_data.get("origin")
console.print(
f" Origin IP reported by httpbin: [bold yellow]{origin_ip}[/]")
# Extract the IPs from the proxy list for comparison
proxy_ips = {p.get("server").split(
":")[1][2:] for p in proxy_params_list}
if origin_ip and origin_ip in proxy_ips:
console.print(
"[bold green] Verification SUCCESS: Origin IP matches one of the provided proxies![/]")
elif origin_ip:
console.print(
"[bold red] Verification FAILED: Origin IP does not match any provided proxy IPs.[/]")
console.print(f" Provided Proxy IPs: {proxy_ips}")
else:
console.print(
"[yellow] Verification SKIPPED: Could not extract origin IP from response.[/]")
else:
console.print(
"[yellow] Verification SKIPPED: Could not find JSON in httpbin response HTML.[/]")
# console.print(f"HTML Received:\n{html_content[:500]}...") # Uncomment for debugging
except json.JSONDecodeError:
console.print(
"[red] Verification FAILED: Could not parse JSON from httpbin response HTML.[/]")
except Exception as e:
console.print(
f"[red] Verification Error: An unexpected error occurred during IP check: {e}[/]")
elif results:
console.print(
"[yellow] Verification SKIPPED: Crawl for IP check was not successful.[/]")
# 4. Extraction Strategies
async def demo_extract_css(client: httpx.AsyncClient):
# Schema to extract book titles and prices
book_schema = {
"name": "BookList",
"baseSelector": "ol.row li.col-xs-6",
"fields": [
{"name": "title", "selector": "article.product_pod h3 a",
"type": "attribute", "attribute": "title"},
{"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
]
}
payload = {
"urls": [BOOKS_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"params": {
"schema": {
"type": "dict",
"value": book_schema
}
}
}
}
}
}
results = await make_request(client, "/crawl", payload, "Demo 4a: JSON/CSS Extraction")
if results and results[0].get("success") and results[0].get("extracted_content"):
try:
extracted_data = json.loads(results[0]["extracted_content"])
if isinstance(extracted_data, list) and extracted_data:
console.print("[cyan]Sample Extracted Books (CSS):[/]")
table = Table(show_header=True, header_style="bold magenta")
table.add_column("Title", style="dim")
table.add_column("Price")
for item in extracted_data[:5]: # Show first 5
table.add_row(item.get('title', 'N/A'),
item.get('price', 'N/A'))
console.print(table)
else:
console.print(
"[yellow]CSS extraction did not return a list of results.[/]")
console.print(extracted_data)
except json.JSONDecodeError:
console.print("[red]Failed to parse extracted_content as JSON.[/]")
except Exception as e:
console.print(
f"[red]Error processing extracted CSS content: {e}[/]")
# 4b. LLM Extraction
async def demo_extract_llm(client: httpx.AsyncClient):
if not os.getenv("OPENAI_API_KEY"): # Basic check for a common key
console.rule(
"[bold yellow]Demo 4b: LLM Extraction (SKIPPED)[/]", style="yellow")
console.print(
"Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
return
payload = {
"urls": [SIMPLE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"extraction_strategy": {
"type": "LLMExtractionStrategy",
"params": {
"instruction": "Extract title and author into JSON.",
"llm_config": { # Optional: Specify provider if not default
"type": "LLMConfig",
"params": {}
# Relies on server's default provider from config.yml & keys from .llm.env
# "params": {
# "provider": "openai/gpt-4o-mini",
# "api_key": os.getenv("OPENAI_API_KEY") # Optional: Override key
# }
},
"schema": { # Request structured output
"type": "dict",
"value": {
"title": "BookInfo", "type": "object",
"properties": {
"book_title": {"type": "string"},
"book_author": {"type": "string"}
}
}
}
}
}
}
}
}
results = await make_request(client, "/crawl", payload, "Demo 4b: LLM Extraction")
if results and results[0].get("success") and results[0].get("extracted_content"):
try:
extracted_data = json.loads(results[0]["extracted_content"])
# Handle potential list wrapper from server
if isinstance(extracted_data, list) and extracted_data:
extracted_data = extracted_data[0]
if isinstance(extracted_data, dict):
console.print("[cyan]Extracted Data (LLM):[/]")
syntax = Syntax(json.dumps(extracted_data, indent=2),
"json", theme="monokai", line_numbers=False)
console.print(Panel(syntax, border_style="cyan", expand=False))
else:
console.print(
"[yellow]LLM extraction did not return expected dictionary.[/]")
console.print(extracted_data)
except json.JSONDecodeError:
console.print(
"[red]Failed to parse LLM extracted_content as JSON.[/]")
except Exception as e:
console.print(
f"[red]Error processing extracted LLM content: {e}[/]")
# 5. Deep Crawling
async def demo_deep_basic(client: httpx.AsyncClient):
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 1,
"max_pages": 4,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "DomainFilter",
"params":
{
"allowed_domains": [DEEP_CRAWL_DOMAIN]
}
}
]
}
}
}
}
}
}
}
results = await make_request(client, "/crawl", payload, "Demo 5a: Basic Deep Crawl")
    # print_result_summary is called by make_request, showing URLs and depths
    if results:
        for result in results:
            if result.get("success") and result.get("metadata"):
                depth = result["metadata"].get("depth", "N/A")
                console.print(f" Depth: {depth}")
            elif not result.get("success"):
                console.print(
                    f" [red]Error: {result.get('error_message', 'Unknown error')}[/]")
# 5b. Streaming Deep Crawl
async def demo_deep_streaming(client: httpx.AsyncClient):
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True, # Enable streaming
"cache_mode": "BYPASS",
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 1,
"max_pages": 4,
"filter_chain": {
"type": "FilterChain",
"params": {"filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
}
}
}
}
}
}
# stream_request handles printing results as they arrive
await stream_request(client, "/crawl/stream", payload, "Demo 5b: Streaming Deep Crawl")
# 5c. Deep Crawl with Filtering & Scoring
async def demo_deep_filtering_scoring(client: httpx.AsyncClient):
"""Demonstrates deep crawl with advanced URL filtering and scoring."""
max_depth = 2 # Go a bit deeper to see scoring/filtering effects
max_pages = 6
excluded_pattern = "*/category-1/*" # Example pattern to exclude
keyword_to_score = "product" # Example keyword to prioritize
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{ # Stay on the allowed domain
"type": "DomainFilter",
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
},
{ # Only crawl HTML pages
"type": "ContentTypeFilter",
"params": {"allowed_types": ["text/html"]}
},
{ # Exclude URLs matching the pattern
"type": "URLPatternFilter",
"params": {
"patterns": [excluded_pattern],
"reverse": True # Block if match
}
}
]
}
},
"url_scorer": {
"type": "CompositeScorer",
"params": {
"scorers": [
{ # Boost score for URLs containing the keyword
"type": "KeywordRelevanceScorer",
# Higher weight
"params": {"keywords": [keyword_to_score], "weight": 1.5}
},
{ # Slightly penalize deeper pages
"type": "PathDepthScorer",
"params": {"optimal_depth": 1, "weight": -0.1}
}
]
}
},
# Optional: Only crawl URLs scoring above a threshold
# "score_threshold": 0.1
}
}
}
}
}
results = await make_request(client, "/crawl", payload, "Demo 5c: Deep Crawl with Filtering & Scoring")
# --- Verification/Analysis ---
if results:
console.print("[cyan]Deep Crawl Filtering/Scoring Analysis:[/]")
excluded_found = False
prioritized_found_at_depth1 = False
prioritized_found_overall = False
for result in results:
url = result.get("url", "")
depth = result.get("metadata", {}).get("depth", -1)
# Check Filtering
# Check if the excluded part is present
if excluded_pattern.strip('*') in url:
console.print(
f" [bold red]Filter FAILED:[/bold red] Excluded pattern part '{excluded_pattern.strip('*')}' found in URL: {url}")
excluded_found = True
# Check Scoring (Observation)
if keyword_to_score in url:
prioritized_found_overall = True
# Check if prioritized keywords appeared early (depth 1)
if depth == 1:
prioritized_found_at_depth1 = True
if not excluded_found:
console.print(
f" [green]Filter Check:[/green] No URLs matching excluded pattern '{excluded_pattern}' found.")
else:
console.print(
f" [red]Filter Check:[/red] URLs matching excluded pattern '{excluded_pattern}' were found (unexpected).")
if prioritized_found_at_depth1:
console.print(
f" [green]Scoring Check:[/green] URLs with keyword '{keyword_to_score}' were found at depth 1 (scoring likely influenced).")
elif prioritized_found_overall:
console.print(
f" [yellow]Scoring Check:[/yellow] URLs with keyword '{keyword_to_score}' found, but not necessarily prioritized early (check max_pages/depth limits).")
else:
console.print(
f" [yellow]Scoring Check:[/yellow] No URLs with keyword '{keyword_to_score}' found within crawl limits.")
# print_result_summary called by make_request already shows URLs and depths
# 6a. Deep Crawl with CSS Extraction
async def demo_deep_with_css_extraction(client: httpx.AsyncClient):
# Schema to extract H1 and first paragraph from any page
general_schema = {
"name": "PageContent",
"baseSelector": "body", # Apply to whole body
"fields": [
{"name": "page_title", "selector": "h1",
"type": "text", "default": "N/A"},
{"name": "first_p", "selector": "p", "type": "text",
"default": "N/A"}, # Gets first p tag
]
}
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"extraction_strategy": { # Apply CSS extraction to each page
"type": "JsonCssExtractionStrategy",
"params": {"schema": {"type": "dict", "value": general_schema}}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 1,
"max_pages": 3,
"filter_chain": {
"type": "FilterChain",
"params": {"filters": [
{"type": "DomainFilter", "params": {
"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
{"type": "ContentTypeFilter", "params": {
"allowed_types": ["text/html"]}}
]}
}
}
}
}
}
}
results = await make_request(client, "/crawl", payload, "Demo 6a: Deep Crawl + CSS Extraction")
if results:
console.print("[cyan]CSS Extraction Summary from Deep Crawl:[/]")
for result in results:
if result.get("success") and result.get("extracted_content"):
try:
extracted = json.loads(result["extracted_content"])
if isinstance(extracted, list) and extracted:
extracted = extracted[0] # Use first item
title = extracted.get(
'page_title', 'N/A') if isinstance(extracted, dict) else 'Parse Error'
console.print(
f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Title: {title}")
except Exception:
console.print(
f" [yellow]![/] URL: [link={result['url']}]{result['url']}[/link] | Failed to parse extracted content")
elif result.get("success"):
console.print(
f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
else:
console.print(
f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
# 6b. Deep Crawl with LLM Extraction
async def demo_deep_with_llm_extraction(client: httpx.AsyncClient):
if not os.getenv("OPENAI_API_KEY"): # Basic check
console.rule(
"[bold yellow]Demo 6b: Deep Crawl + LLM Extraction (SKIPPED)[/]", style="yellow")
console.print(
"Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
return
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"extraction_strategy": { # Apply LLM extraction to each page
"type": "LLMExtractionStrategy",
"params": {
"instruction": "What is the main topic of this page based on the H1 and first paragraph? Respond with just the topic.",
# Rely on server default LLM config + .llm.env keys
}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 1,
"max_pages": 2, # Reduce pages for LLM cost/time
"filter_chain": {
"type": "FilterChain",
"params": {"filters": [
{"type": "DomainFilter", "params": {
"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
{"type": "ContentTypeFilter", "params": {
"allowed_types": ["text/html"]}}
]}
}
}
}
}
}
}
results = await make_request(client, "/crawl", payload, "Demo 6b: Deep Crawl + LLM Extraction")
if results:
console.print("[cyan]LLM Extraction Summary from Deep Crawl:[/]")
for result in results:
if result.get("success") and result.get("extracted_content"):
console.print(
f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Topic: {result['extracted_content']}")
elif result.get("success"):
console.print(
f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
else:
console.print(
f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
# 6c. Deep Crawl with Proxies
async def demo_deep_with_proxy(client: httpx.AsyncClient):
proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts
if not proxy_params_list:
console.rule(
"[bold yellow]Demo 6c: Deep Crawl + Proxies (SKIPPED)[/]", style="yellow")
console.print("Set the PROXIES environment variable to run this demo.")
return
payload = {
# Use a site likely accessible via proxies
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"proxy_rotation_strategy": {
"type": "RoundRobinProxyStrategy",
"params": {
# Correctly create the list of {"type": ..., "params": ...} structures, excluding the demo 'ip' key
"proxies": [
{"type": "ProxyConfig", "params": {
k: v for k, v in p.items() if k != 'ip'}}
for p in proxy_params_list
]
}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 1, # Just crawl start URL via proxy
"max_pages": 5,
}
}
}
}
}
# make_request calls print_result_summary, which shows URL and success status
results = await make_request(client, "/crawl", payload, "Demo 6c: Deep Crawl + Proxies")
if not results:
console.print("[red]No results returned from the crawl.[/]")
return
console.print("[cyan]Proxy Usage Summary from Deep Crawl:[/]")
# Verification of specific proxy IP usage would require more complex setup or server logs.
for result in results:
if result.get("success") and result.get("metadata"):
proxy_ip = result["metadata"].get("proxy_ip", "N/A")
console.print(f" Proxy IP used: {proxy_ip}")
elif not result.get("success"):
console.print(f" [red]Error: {result['error_message']}[/]")
# 6d. Deep Crawl with SSL Certificate Fetching
async def demo_deep_with_ssl(client: httpx.AsyncClient):
"""Test BFS deep crawl with fetch_ssl_certificate enabled."""
payload = {
"urls": [DEEP_CRAWL_BASE_URL], # Needs HTTPS
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"fetch_ssl_certificate": True, # <-- Enable SSL fetching
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 1, # Crawl a bit deeper
"max_pages": 3,
"filter_chain": {
"type": "FilterChain",
"params": {"filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
}
}
}
}
}
}
results = await make_request(client, "/crawl", payload, "Demo 6d: Deep Crawl + Fetch SSL")
if results:
console.print("[cyan]SSL Certificate Summary from Deep Crawl:[/]")
for result in results:
if result.get("success") and result.get("ssl_certificate"):
cert = result["ssl_certificate"]
issuer_org = cert.get('issuer', {}).get('O', 'N/A')
valid_from = cert.get('not_before', 'N/A')
valid_to = cert.get('not_after', 'N/A')
console.print(
f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Issuer: {issuer_org} | Valid: {valid_from} - {valid_to}")
elif result.get("success"):
console.print(
f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | SSL cert not fetched or N/A.")
else:
console.print(
f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
# 7a. Markdown helper endpoint (/md)
async def demo_markdown_endpoint(client: httpx.AsyncClient):
"""
One-shot helper around /md.
Fetches PYTHON_URL with FIT filter and prints the first 500 chars of Markdown.
"""
target_url = PYTHON_URL
payload = {"url": target_url, "f": "fit", "q": None, "c": "0"}
console.rule("[bold blue]Demo 7a: /md Endpoint[/]", style="blue")
print_payload(payload)
try:
t0 = time.time()
resp = await client.post("/md", json=payload)
dt = time.time() - t0
console.print(
f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
resp.raise_for_status()
md = resp.json().get("markdown", "")
snippet = (md[:500] + "...") if len(md) > 500 else md
console.print(Panel(snippet, title="Markdown snippet",
border_style="cyan", expand=False))
except Exception as e:
console.print(f"[bold red]Error hitting /md:[/] {e}")
# 7b. LLM QA helper endpoint (/llm)
async def demo_llm_endpoint(client: httpx.AsyncClient):
"""
Quick QA round-trip with /llm.
Asks a trivial question against SIMPLE_URL just to show wiring.
"""
page_url = SIMPLE_URL
question = "What is the title of this page?"
console.rule("[bold magenta]Demo 7b: /llm Endpoint[/]", style="magenta")
enc = urllib.parse.quote_plus(page_url, safe="")
console.print(f"GET /llm/{enc}?q={question}")
try:
t0 = time.time()
resp = await client.get(f"/llm/{enc}", params={"q": question})
dt = time.time() - t0
console.print(
f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
resp.raise_for_status()
answer = resp.json().get("answer", "")
console.print(Panel(answer or "No answer returned",
title="LLM answer", border_style="magenta", expand=False))
except Exception as e:
console.print(f"[bold red]Error hitting /llm:[/] {e}")
# 8. /config/dump helpers --------------------------------------------------
async def demo_config_dump_valid(client: httpx.AsyncClient):
"""
Send a single top-level CrawlerRunConfig(...) expression and show the dump.
"""
code_snippet = "CrawlerRunConfig(cache_mode='BYPASS', screenshot=True)"
payload = {"code": code_snippet}
console.rule("[bold blue]Demo 8a: /config/dump (valid)[/]", style="blue")
print_payload(payload)
try:
t0 = time.time()
resp = await client.post("/config/dump", json=payload)
dt = time.time() - t0
console.print(
f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
resp.raise_for_status()
dump_json = resp.json()
console.print(Panel(Syntax(json.dumps(dump_json, indent=2),
"json", theme="monokai"), title="Dump()", border_style="cyan"))
except Exception as e:
console.print(f"[bold red]Error in valid /config/dump call:[/] {e}")
async def demo_config_dump_invalid(client: httpx.AsyncClient):
"""
    Purposely violate the single top-level call rule (two statements) to show the 400 parse error.
"""
bad_code = textwrap.dedent("""
BrowserConfig(headless=True); CrawlerRunConfig()
""").strip()
payload = {"code": bad_code}
console.rule(
"[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
print_payload(payload)
try:
resp = await client.post("/config/dump", json=payload)
console.print(
f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
resp.raise_for_status() # should throw -> except
except httpx.HTTPStatusError as e:
console.print("[cyan]Expected parse/validation failure captured:[/]")
try:
console.print(Panel(Syntax(json.dumps(
e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
except Exception:
console.print(e.response.text)
except Exception as e:
console.print(
f"[bold red]Unexpected error during invalid test:[/] {e}")
# --- Main Runner ---
async def main_demo():
async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
if not await check_server_health(client):
return
# --- Run Demos ---
# await demo_basic_single_url(client)
# await demo_basic_multi_url(client)
# await demo_streaming_multi_url(client)
# await demo_markdown_default(client)
# await demo_markdown_pruning(client)
# await demo_markdown_bm25(client)
# await demo_param_css_selector(client)
# await demo_param_js_execution(client)
# await demo_param_screenshot(client)
# await demo_param_ssl_fetch(client)
# await demo_param_proxy(client) # Skips if no PROXIES env var
# await demo_extract_css(client)
# await demo_extract_llm(client) # Skips if no common LLM key env var
# await demo_deep_basic(client)
        # await demo_deep_streaming(client)  # This needs extra work
# await demo_deep_with_css_extraction(client)
# # Skips if no common LLM key env var
# await demo_deep_with_llm_extraction(client)
# await demo_deep_with_proxy(client) # Skips if no PROXIES env var
        # await demo_deep_with_ssl(client)
# --- Helper endpoints ---
await demo_markdown_endpoint(client)
await demo_llm_endpoint(client)
# --- /config/dump sanity checks ---
await demo_config_dump_valid(client)
await demo_config_dump_invalid(client)
console.rule("[bold green]Demo Complete[/]", style="green")
if __name__ == "__main__":
try:
asyncio.run(main_demo())
except KeyboardInterrupt:
console.print("\n[yellow]Demo interrupted by user.[/]")
except Exception as e:
console.print(
f"\n[bold red]An error occurred during demo execution:[/]")
console.print_exception(show_locals=False)