refactor(docker-demo): enhance error handling and output formatting

Improve the Docker API demo script with better error handling, more detailed output,
and enhanced visualization:
- Add detailed error messages and stack traces for debugging
- Implement better status code handling and display
- Enhance JSON output formatting with monokai theme and word wrap (sketched below)
- Add depth information display for deep crawls
- Improve proxy usage reporting
- Fix port number inconsistency

No breaking changes.
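
Not part of the diff below — a minimal, self-contained sketch of the reworked payload printer described in the bullets above, assuming Rich is installed; the sample payload passed at the bottom is illustrative only:

```python
# Illustrative sketch of the monokai/word-wrap payload printer; mirrors the
# script's print_payload, with imports added so it runs standalone.
import json
from typing import Any, Dict

from rich.console import Console
from rich.panel import Panel
from rich.syntax import Syntax

console = Console()

def print_payload(payload: Dict[str, Any]):
    """Prints the JSON payload nicely with a dark theme."""
    syntax = Syntax(
        json.dumps(payload, indent=2),
        "json",
        theme="monokai",   # dark theme for readability
        line_numbers=False,
        word_wrap=True,    # wrap long values instead of overflowing the panel
    )
    console.print(Panel(syntax, title="Request Payload", border_style="blue", expand=False))

if __name__ == "__main__":
    # Hypothetical example payload, shaped like the demo's crawl requests.
    print_payload({"urls": ["https://httpbin.org/html"],
                   "crawler_config": {"type": "CrawlerRunConfig", "params": {"stream": False}}})
```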
UncleCode 2025-04-17 22:32:58 +08:00
parent fd899f66aa
commit 3bf78ff47a


@@ -16,8 +16,8 @@ load_dotenv() # Load environment variables from .env file
console = Console()
# --- Configuration ---
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")
# Target URLs
SIMPLE_URL = "https://httpbin.org/html"
LINKS_URL = "https://httpbin.org/links/10/0"
@@ -50,8 +50,14 @@ async def check_server_health(client: httpx.AsyncClient):
return False
def print_payload(payload: Dict[str, Any]):
"""Prints the JSON payload nicely."""
syntax = Syntax(json.dumps(payload, indent=2), "json", theme="default", line_numbers=False)
"""Prints the JSON payload nicely with a dark theme."""
syntax = Syntax(
json.dumps(payload, indent=2),
"json",
theme="monokai", # <--- Changed theme here
line_numbers=False,
word_wrap=True # Added word wrap for potentially long payloads
)
console.print(Panel(syntax, title="Request Payload", border_style="blue", expand=False))
def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results Summary", max_items: int = 3):
@@ -126,12 +132,15 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict
print_payload(payload)
console.print(f"Sending POST stream request to {client.base_url}{endpoint}...")
all_results = []
initial_status_code = None # Store initial status code
try:
start_time = time.time()
async with client.stream("POST", endpoint, json=payload) as response:
initial_status_code = response.status_code # Capture initial status
duration = time.time() - start_time # Time to first byte potentially
console.print(f"Initial Response Status: [bold {'green' if response.status_code == 200 else 'red'}]{response.status_code}[/] (first byte ~{duration:.2f}s)")
response.raise_for_status()
console.print(f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)")
response.raise_for_status() # Raise exception for bad *initial* status codes
console.print("[magenta]--- Streaming Results ---[/]")
completed = False
@@ -143,11 +152,16 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict
completed = True
console.print("[bold green]--- Stream Completed ---[/]")
break
elif data.get("url"): # Looks like a result
elif data.get("url"): # Looks like a result dictionary
all_results.append(data)
# Display summary info as it arrives
success_icon = "[green]✔[/]" if data.get('success') else "[red]✘[/]"
url = data.get('url', 'N/A')
console.print(f" {success_icon} Received: [link={url}]{url}[/link]")
# Display status code FROM THE RESULT DATA if available
result_status = data.get('status_code', 'N/A')
console.print(f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})")
if not data.get('success') and data.get('error_message'):
console.print(f" [red]Error: {data['error_message']}[/]")
else:
console.print(f" [yellow]Stream meta-data:[/yellow] {data}")
except json.JSONDecodeError:
@@ -156,8 +170,10 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict
console.print("[bold yellow]Warning: Stream ended without 'completed' marker.[/]")
except httpx.HTTPStatusError as e:
console.print(f"[bold red]HTTP Error:[/]")
console.print(f"Status: {e.response.status_code}")
# Use the captured initial status code if available, otherwise from the exception
status = initial_status_code if initial_status_code is not None else e.response.status_code
console.print(f"[bold red]HTTP Error (Initial Request):[/]")
console.print(f"Status: {status}")
try:
console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
except json.JSONDecodeError:
@ -165,11 +181,12 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict
except httpx.RequestError as e:
console.print(f"[bold red]Request Error: {e}[/]")
except Exception as e:
console.print(f"[bold red]Unexpected Error: {e}[/]")
console.print(f"[bold red]Unexpected Error during streaming: {e}[/]")
console.print_exception(show_locals=False) # Print stack trace for unexpected errors
# Call print_result_summary with the *collected* results AFTER the stream is done
print_result_summary(all_results, title=f"{title} Collected Results")
def load_proxies_from_env() -> List[Dict]:
"""
Load proxies from the PROXIES environment variable.
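
Not part of the diff — a quick way to exercise the streaming path above against a local server might look like the sketch below. It assumes the script's stream_request() is in scope; the base URL matches the script's default, while the timeout and target URL are illustrative, and the payload shape mirrors the browser_config/crawler_config dictionaries used elsewhere in the demo:

```python
# Hypothetical smoke test for stream_request against a local Crawl4AI server.
import asyncio
import httpx

async def run_stream_smoke_test():
    payload = {
        "urls": ["https://httpbin.org/links/10/0"],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {"type": "CrawlerRunConfig", "params": {"stream": True, "cache_mode": "BYPASS"}},
    }
    # 300s timeout is an assumption; adjust to your server's crawl times.
    async with httpx.AsyncClient(base_url="http://localhost:11235", timeout=300.0) as client:
        await stream_request(client, "/crawl/stream", payload, "Streaming smoke test")

if __name__ == "__main__":
    asyncio.run(run_stream_smoke_test())
```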
@@ -583,7 +600,7 @@ async def demo_extract_llm(client: httpx.AsyncClient):
if isinstance(extracted_data, dict):
console.print("[cyan]Extracted Data (LLM):[/]")
syntax = Syntax(json.dumps(extracted_data, indent=2), "json", theme="default", line_numbers=False)
syntax = Syntax(json.dumps(extracted_data, indent=2), "json", theme="monokai", line_numbers=False)
console.print(Panel(syntax, border_style="cyan", expand=False))
else:
console.print("[yellow]LLM extraction did not return expected dictionary.[/]")
@@ -618,6 +635,12 @@ async def demo_deep_basic(client: httpx.AsyncClient):
}
results = await make_request(client, "/crawl", payload, "Demo 5a: Basic Deep Crawl")
# print_result_summary is called by make_request, showing URLs and depths
for result in results:
if result.get("success") and result.get("metadata"):
depth = result["metadata"].get("depth", "N/A")
console.print(f" Depth: {depth}")
elif not result.get("success"):
console.print(f" [red]Error: {result['error_message']}[/]")
# 5. Streaming Deep Crawl
async def demo_deep_streaming(client: httpx.AsyncClient):
@@ -646,6 +669,109 @@ async def demo_deep_streaming(client: httpx.AsyncClient):
# stream_request handles printing results as they arrive
await stream_request(client, "/crawl/stream", payload, "Demo 5b: Streaming Deep Crawl")
# 5a. Deep Crawl with Filtering & Scoring
async def demo_deep_filtering_scoring(client: httpx.AsyncClient):
"""Demonstrates deep crawl with advanced URL filtering and scoring."""
max_depth = 2 # Go a bit deeper to see scoring/filtering effects
max_pages = 6
excluded_pattern = "*/category-1/*" # Example pattern to exclude
keyword_to_score = "product" # Example keyword to prioritize
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{ # Stay on the allowed domain
"type": "DomainFilter",
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
},
{ # Only crawl HTML pages
"type": "ContentTypeFilter",
"params": {"allowed_types": ["text/html"]}
},
{ # Exclude URLs matching the pattern
"type": "URLPatternFilter",
"params": {
"patterns": [excluded_pattern],
"reverse": True # Block if match
}
}
]
}
},
"url_scorer": {
"type": "CompositeScorer",
"params": {
"scorers": [
{ # Boost score for URLs containing the keyword
"type": "KeywordRelevanceScorer",
"params": {"keywords": [keyword_to_score], "weight": 1.5} # Higher weight
},
{ # Slightly penalize deeper pages
"type": "PathDepthScorer",
"params": {"optimal_depth": 1, "weight": -0.1}
}
]
}
},
# Optional: Only crawl URLs scoring above a threshold
# "score_threshold": 0.1
}
}
}
}
}
results = await make_request(client, "/crawl", payload, "Demo 5c: Deep Crawl with Filtering & Scoring")
# --- Verification/Analysis ---
if results:
console.print("[cyan]Deep Crawl Filtering/Scoring Analysis:[/]")
excluded_found = False
prioritized_found_at_depth1 = False
prioritized_found_overall = False
for result in results:
url = result.get("url", "")
depth = result.get("metadata", {}).get("depth", -1)
# Check Filtering
if excluded_pattern.strip('*') in url: # Check if the excluded part is present
console.print(f" [bold red]Filter FAILED:[/bold red] Excluded pattern part '{excluded_pattern.strip('*')}' found in URL: {url}")
excluded_found = True
# Check Scoring (Observation)
if keyword_to_score in url:
prioritized_found_overall = True
if depth == 1: # Check if prioritized keywords appeared early (depth 1)
prioritized_found_at_depth1 = True
if not excluded_found:
console.print(f" [green]Filter Check:[/green] No URLs matching excluded pattern '{excluded_pattern}' found.")
else:
console.print(f" [red]Filter Check:[/red] URLs matching excluded pattern '{excluded_pattern}' were found (unexpected).")
if prioritized_found_at_depth1:
console.print(f" [green]Scoring Check:[/green] URLs with keyword '{keyword_to_score}' were found at depth 1 (scoring likely influenced).")
elif prioritized_found_overall:
console.print(f" [yellow]Scoring Check:[/yellow] URLs with keyword '{keyword_to_score}' found, but not necessarily prioritized early (check max_pages/depth limits).")
else:
console.print(f" [yellow]Scoring Check:[/yellow] No URLs with keyword '{keyword_to_score}' found within crawl limits.")
# print_result_summary called by make_request already shows URLs and depths
# 6. Deep Crawl with Extraction
async def demo_deep_with_css_extraction(client: httpx.AsyncClient):
# Schema to extract H1 and first paragraph from any page
@@ -782,16 +908,26 @@ async def demo_deep_with_proxy(client: httpx.AsyncClient):
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 0, # Just crawl start URL via proxy
"max_pages": 1,
"max_depth": 1, # Just crawl start URL via proxy
"max_pages": 5,
}
}
}
}
}
# make_request calls print_result_summary, which shows URL and success status
await make_request(client, "/crawl", payload, "Demo 6c: Deep Crawl + Proxies")
results = await make_request(client, "/crawl", payload, "Demo 6c: Deep Crawl + Proxies")
if not results:
console.print("[red]No results returned from the crawl.[/]")
return
console.print("[cyan]Proxy Usage Summary from Deep Crawl:[/]")
# Verification of specific proxy IP usage would require more complex setup or server logs.
for result in results:
if result.get("success") and result.get("metadata"):
proxy_ip = result["metadata"].get("proxy_ip", "N/A")
console.print(f" Proxy IP used: {proxy_ip}")
elif not result.get("success"):
console.print(f" [red]Error: {result['error_message']}[/]")
# 6d. Deep Crawl with SSL Certificate Fetching
@@ -844,26 +980,26 @@ async def main_demo():
return
# --- Run Demos ---
# await demo_basic_single_url(client)
# await demo_basic_multi_url(client)
# await demo_streaming_multi_url(client)
await demo_basic_single_url(client)
await demo_basic_multi_url(client)
await demo_streaming_multi_url(client)
# await demo_markdown_default(client)
# await demo_markdown_pruning(client)
# await demo_markdown_bm25(client)
await demo_markdown_default(client)
await demo_markdown_pruning(client)
await demo_markdown_bm25(client)
# await demo_param_css_selector(client)
# await demo_param_js_execution(client)
# await demo_param_screenshot(client)
# await demo_param_ssl_fetch(client)
# await demo_param_proxy(client) # Skips if no PROXIES env var
await demo_param_css_selector(client)
await demo_param_js_execution(client)
await demo_param_screenshot(client)
await demo_param_ssl_fetch(client)
await demo_param_proxy(client) # Skips if no PROXIES env var
# await demo_extract_css(client)
await demo_extract_css(client)
await demo_extract_llm(client) # Skips if no common LLM key env var
await demo_deep_basic(client)
await demo_deep_streaming(client)
# demo_deep_filtering_scoring skipped for brevity, add if needed
await demo_deep_streaming(client) # This needs extra work
await demo_deep_with_css_extraction(client)
await demo_deep_with_llm_extraction(client) # Skips if no common LLM key env var