feat(cli): add command line interface with comprehensive features

Implements a full-featured CLI for Crawl4AI with the following capabilities:
- Basic and advanced web crawling
- Configuration management via YAML/JSON files
- Multiple extraction strategies (CSS, XPath, LLM)
- Content filtering and optimization
- Interactive Q&A capabilities
- Various output formats
- Comprehensive documentation and examples

Also includes:
- Home directory setup for configuration and cache
- Environment variable support for API tokens
- Test suite for CLI functionality
Author: UncleCode
Date: 2025-02-10 16:58:52 +08:00
Parent: 467be9ac76
Commit: 91a5fea11f
14 changed files with 983 additions and 7 deletions

View File

@@ -361,7 +361,8 @@ class BrowserConfig():
    @staticmethod
    def load(data: dict) -> "BrowserConfig":
        # Deserialize the object from a dictionary
-        return from_serializable_dict(data) if data else BrowserConfig()
+        config = from_serializable_dict(data)
+        return BrowserConfig.from_kwargs(config)
class CrawlerRunConfig():
@@ -811,7 +812,8 @@ class CrawlerRunConfig():
    @staticmethod
    def load(data: dict) -> "CrawlerRunConfig":
        # Deserialize the object from a dictionary
-        return from_serializable_dict(data) if data else CrawlerRunConfig()
+        config = from_serializable_dict(data)
+        return CrawlerRunConfig.from_kwargs(config)
def to_dict(self):
return {
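Note (illustrative, not part of this diff): these load() methods are what the new CLI below uses to turn a plain YAML/JSON dict into a config object before applying key=value overrides via clone(). A minimal sketch, assuming a browser.yml like the sample added under docs/examples/cli:

    import yaml
    from crawl4ai import BrowserConfig

    with open("browser.yml") as f:
        browser_cfg = BrowserConfig.load(yaml.safe_load(f))
    # CLI -b overrides such as "headless=false" are applied the same way:
    browser_cfg = browser_cfg.clone(headless=False)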

View File

@@ -0,0 +1,404 @@
import click
import os
from typing import Dict, Any, Optional
import json
import yaml
import anyio
from crawl4ai import (
CacheMode,
AsyncWebCrawler,
CrawlResult,
BrowserConfig,
CrawlerRunConfig,
LLMExtractionStrategy,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
BM25ContentFilter,
PruningContentFilter
)
from litellm import completion
from pathlib import Path
def get_global_config() -> dict:
config_dir = Path.home() / ".crawl4ai"
config_file = config_dir / "global.yml"
if not config_file.exists():
config_dir.mkdir(parents=True, exist_ok=True)
return {}
with open(config_file) as f:
return yaml.safe_load(f) or {}
def save_global_config(config: dict):
config_file = Path.home() / ".crawl4ai" / "global.yml"
with open(config_file, "w") as f:
yaml.dump(config, f)
def setup_llm_config() -> tuple[str, str]:
config = get_global_config()
provider = config.get("DEFAULT_LLM_PROVIDER")
token = config.get("DEFAULT_LLM_PROVIDER_TOKEN")
if not provider:
click.echo("\nNo default LLM provider configured.")
click.echo("Provider format: 'company/model' (e.g., 'openai/gpt-4o', 'anthropic/claude-3-sonnet')")
click.echo("See available providers at: https://docs.litellm.ai/docs/providers")
provider = click.prompt("Enter provider")
if not provider.startswith("ollama/"):
if not token:
token = click.prompt("Enter API token for " + provider, hide_input=True)
else:
token = "no-token"
if not config.get("DEFAULT_LLM_PROVIDER") or not config.get("DEFAULT_LLM_PROVIDER_TOKEN"):
config["DEFAULT_LLM_PROVIDER"] = provider
config["DEFAULT_LLM_PROVIDER_TOKEN"] = token
save_global_config(config)
click.echo("\nConfiguration saved to ~/.crawl4ai/global.yml")
return provider, token
async def stream_llm_response(url: str, markdown: str, query: str, provider: str, token: str):
response = completion(
model=provider,
api_key=token,
messages=[
{
"content": f"You are Crawl4ai assistant, answering user question based on the provided context which is crawled from {url}.",
"role": "system"
},
{
"content": f"<|start of context|>\n{markdown}\n<|end of context|>\n\n{query}",
"role": "user"
},
],
stream=True,
)
for chunk in response:
if content := chunk["choices"][0]["delta"].get("content"):
print(content, end="", flush=True)
print() # New line at end
def parse_key_values(ctx, param, value) -> Dict[str, Any]:
if not value:
return {}
result = {}
pairs = value.split(',')
for pair in pairs:
try:
k, v = pair.split('=', 1)
# Handle common value types
if v.lower() == 'true': v = True
elif v.lower() == 'false': v = False
elif v.isdigit(): v = int(v)
elif v.replace('.','',1).isdigit(): v = float(v)
elif v.startswith('[') and v.endswith(']'):
v = [x.strip() for x in v[1:-1].split(',') if x.strip()]
elif v.startswith('{') and v.endswith('}'):
try:
v = json.loads(v)
except json.JSONDecodeError:
raise click.BadParameter(f'Invalid JSON object: {v}')
result[k.strip()] = v
except ValueError:
raise click.BadParameter(f'Invalid key=value pair: {pair}')
return result
def load_config_file(path: Optional[str]) -> dict:
if not path:
return {}
try:
with open(path) as f:
if path.endswith((".yaml", ".yml")):
return yaml.safe_load(f)
return json.load(f)
except Exception as e:
raise click.BadParameter(f'Error loading config file {path}: {str(e)}')
def load_schema_file(path: Optional[str]) -> dict:
if not path:
return None
return load_config_file(path)
async def run_crawler(url: str, browser_cfg: BrowserConfig, crawler_cfg: CrawlerRunConfig, verbose: bool):
if verbose:
click.echo("Starting crawler with configurations:")
click.echo(f"Browser config: {browser_cfg.dump()}")
click.echo(f"Crawler config: {crawler_cfg.dump()}")
async with AsyncWebCrawler(config=browser_cfg) as crawler:
try:
result = await crawler.arun(url=url, config=crawler_cfg)
return result
except Exception as e:
raise click.ClickException(f"Crawling failed: {str(e)}")
def show_examples():
examples = """
🚀 Crawl4AI CLI Examples
1 Basic Usage:
# Simple crawl with default settings
crwl https://example.com
# Get markdown output
crwl https://example.com -o markdown
# Verbose JSON output with cache bypass
crwl https://example.com -o json -v --bypass-cache
2 Using Config Files:
# Using browser and crawler configs
crwl https://example.com -B browser.yml -C crawler.yml
# CSS-based extraction
crwl https://example.com -e extract_css.yml -s css_schema.json -o json
# LLM-based extraction
crwl https://example.com -e extract_llm.yml -s llm_schema.json -o json
3 Direct Parameters:
# Browser settings
crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
# Crawler settings
crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
4 Sample Config Files:
browser.yml:
headless: true
viewport_width: 1280
user_agent_mode: "random"
verbose: true
ignore_https_errors: true
extract_css.yml:
type: "json-css"
params:
verbose: true
css_schema.json:
{
"name": "ArticleExtractor",
"baseSelector": ".article",
"fields": [
{
"name": "title",
"selector": "h1.title",
"type": "text"
},
{
"name": "link",
"selector": "a.read-more",
"type": "attribute",
"attribute": "href"
}
]
}
extract_llm.yml:
type: "llm"
provider: "openai/gpt-4"
instruction: "Extract all articles with their titles and links"
api_token: "your-token"
params:
temperature: 0.3
max_tokens: 1000
llm_schema.json:
{
"title": "Article",
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "The title of the article"
},
"link": {
"type": "string",
"description": "URL to the full article"
}
}
}
5 Advanced Usage:
# Combine configs with direct parameters
crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
# Full extraction pipeline
crwl https://example.com \\
-B browser.yml \\
-C crawler.yml \\
-e extract_llm.yml \\
-s llm_schema.json \\
-o json \\
-v
# Content filtering with BM25
crwl https://example.com \\
-f filter_bm25.yml \\
-o markdown-fit
For more documentation visit: https://github.com/unclecode/crawl4ai
6 Q&A with LLM:
# Ask a question about the content
crwl https://example.com -q "What is the main topic discussed?"
# First view content, then ask questions
crwl https://example.com -o markdown # See the crawled content first
crwl https://example.com -q "Summarize the key points"
crwl https://example.com -q "What are the conclusions?"
# Advanced crawling with Q&A
crwl https://example.com \\
-B browser.yml \\
-c "css_selector=article,scan_full_page=true" \\
-q "What are the pros and cons mentioned?"
Note: First time using -q will prompt for LLM provider and API token.
These will be saved in ~/.crawl4ai/global.yml for future use.
Supported provider format: 'company/model'
Examples:
- ollama/llama3.3
- openai/gpt-4
- anthropic/claude-3-sonnet
- cohere/command
- google/gemini-pro
See full list of providers: https://docs.litellm.ai/docs/providers
"""
click.echo(examples)
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "markdown-v2", "md", "md-fit"]), default="all")
@click.option("--bypass-cache", is_flag=True, default = True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
def cli(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, schema: str, browser: Dict, crawler: Dict,
output: str, bypass_cache: bool, question: str, verbose: bool):
"""Crawl4AI CLI - Web content extraction tool
Simple Usage:
crwl https://example.com
Run with --example to see detailed usage examples."""
if example:
show_examples()
return
if not url:
raise click.UsageError("URL argument is required unless using --example")
try:
# Load base configurations
browser_cfg = BrowserConfig.load(load_config_file(browser_config))
crawler_cfg = CrawlerRunConfig.load(load_config_file(crawler_config))
# Override with CLI params
if browser:
browser_cfg = browser_cfg.clone(**browser)
if crawler:
crawler_cfg = crawler_cfg.clone(**crawler)
# Handle content filter config
if filter_config:
filter_conf = load_config_file(filter_config)
if filter_conf["type"] == "bm25":
crawler_cfg.content_filter = BM25ContentFilter(
user_query=filter_conf.get("query"),
bm25_threshold=filter_conf.get("threshold", 1.0)
)
elif filter_conf["type"] == "pruning":
crawler_cfg.content_filter = PruningContentFilter(
user_query=filter_conf.get("query"),
threshold=filter_conf.get("threshold", 0.48)
)
# Handle extraction strategy
if extraction_config:
extract_conf = load_config_file(extraction_config)
schema_data = load_schema_file(schema)
# If no extraction type is specified, show a proper error message
if not extract_conf.get("type"):
raise click.ClickException("Extraction type not specified")
if extract_conf["type"] not in ["llm", "json-css", "json-xpath"]:
raise click.ClickException(f"Invalid extraction type: {extract_conf['type']}")
if extract_conf["type"] == "llm":
# If no provider or API token is given, show an error message
if not extract_conf.get("provider") or not extract_conf.get("api_token"):
raise click.ClickException("LLM provider and API token are required for LLM extraction")
crawler_cfg.extraction_strategy = LLMExtractionStrategy(
provider=extract_conf["provider"],
instruction=extract_conf["instruction"],
api_token=extract_conf.get("api_token", extract_conf.get("api_key")),
schema=schema_data,
**extract_conf.get("params", {})
)
elif extract_conf["type"] == "json-css":
crawler_cfg.extraction_strategy = JsonCssExtractionStrategy(
schema=schema_data
)
elif extract_conf["type"] == "json-xpath":
crawler_cfg.extraction_strategy = JsonXPathExtractionStrategy(
schema=schema_data
)
# No cache
if bypass_cache:
crawler_cfg.cache_mode = CacheMode.BYPASS
# Run crawler
result : CrawlResult = anyio.run(
run_crawler,
url,
browser_cfg,
crawler_cfg,
verbose
)
# Handle question
if question:
provider, token = setup_llm_config()
markdown = result.markdown_v2.raw_markdown
anyio.run(stream_llm_response, url, markdown, question, provider, token)
return
# Handle output
if output == "all":
click.echo(json.dumps(result.model_dump(), indent=2))
elif output == "json":
click.echo(json.dumps(json.loads(result.extracted_content), indent=2))
elif output in ["markdown", "md"]:
click.echo(result.markdown_v2.raw_markdown)
elif output in ["markdown-fit", "md-fit"]:
click.echo(result.markdown_v2.fit_markdown)
except Exception as e:
raise click.ClickException(str(e))
if __name__ == "__main__":
cli()

View File

@@ -15,6 +15,8 @@ PROVIDER_MODELS = {
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
"openai/o1-mini": os.getenv("OPENAI_API_KEY"),
"openai/o1-preview": os.getenv("OPENAI_API_KEY"),
"openai/o3-mini": os.getenv("OPENAI_API_KEY"),
"openai/o3-mini-high": os.getenv("OPENAI_API_KEY"),
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),

View File

@@ -533,11 +533,15 @@ class LLMExtractionStrategy(ExtractionStrategy):
        """
        super().__init__(**kwargs)
        self.provider = provider
-        self.api_token = (
-            api_token
-            or PROVIDER_MODELS.get(provider, "no-token")
-            or os.getenv("OPENAI_API_KEY")
-        )
+        if api_token and not api_token.startswith("env:"):
+            self.api_token = api_token
+        elif api_token and api_token.startswith("env:"):
+            self.api_token = os.getenv(api_token[4:])
+        else:
+            self.api_token = (
+                PROVIDER_MODELS.get(provider, "no-token")
+                or os.getenv("OPENAI_API_KEY")
+            )
        self.instruction = instruction
        self.extract_type = extraction_type
        self.schema = schema
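Note (illustrative, not part of this diff): with this change an api_token can point at an environment variable instead of embedding the key itself. A minimal sketch using only constructor arguments that appear in this commit:

    from crawl4ai import LLMExtractionStrategy

    strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o-mini",
        instruction="Extract all articles with their titles and links",
        api_token="env:OPENAI_API_KEY",  # resolved via os.getenv("OPENAI_API_KEY")
    )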

View File

@@ -2,14 +2,47 @@ import subprocess
import sys
import asyncio
from .async_logger import AsyncLogger, LogLevel
from pathlib import Path
import os
import shutil
# Initialize logger
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
def setup_home_directory():
"""Set up the .crawl4ai folder structure in the user's home directory."""
base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
crawl4ai_config = crawl4ai_folder / "global.yml"
crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
cache_folder = crawl4ai_folder / "cache"
content_folders = [
"html_content",
"cleaned_html",
"markdown_content",
"extracted_content",
"screenshots",
]
# Clean up old cache if exists
if cache_folder.exists():
shutil.rmtree(cache_folder)
# Create new folder structure
crawl4ai_folder.mkdir(exist_ok=True)
cache_folder.mkdir(exist_ok=True)
for folder in content_folders:
(crawl4ai_folder / folder).mkdir(exist_ok=True)
# If config file does not exist, create it
if not crawl4ai_config.exists():
with open(crawl4ai_config, "w") as f:
f.write("")
def post_install():
"""Run all post-installation tasks"""
logger.info("Running post-installation setup...", tag="INIT")
setup_home_directory()
install_playwright()
run_migration()
logger.success("Post-installation setup completed!", tag="COMPLETE")
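For reference (not part of the diff), the layout created under the base directory (the user's home by default, or CRAWL4_AI_BASE_DIRECTORY if set) looks like this; an existing cache folder is wiped first, and an empty global.yml is written to the base directory if one does not already exist:

    .crawl4ai/
        cache/
        html_content/
        cleaned_html/
        markdown_content/
        extracted_content/
        screenshots/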

View File

@@ -0,0 +1,13 @@
browser_type: "chromium"
headless: true
viewport_width: 1280
viewport_height: 800
user_agent_mode: "random"
verbose: true
text_mode: false
light_mode: false
ignore_https_errors: true
java_script_enabled: true
extra_args:
- "--disable-gpu"
- "--no-sandbox"

View File

@@ -0,0 +1,13 @@
cache_mode: "bypass"
wait_until: "networkidle"
page_timeout: 30000
delay_before_return_html: 0.5
word_count_threshold: 100
scan_full_page: true
scroll_delay: 0.3
process_iframes: false
remove_overlay_elements: true
magic: true
verbose: true
exclude_external_links: true
exclude_social_media_links: true

View File

@@ -0,0 +1,27 @@
{
"name": "ArticleExtractor",
"baseSelector": ".cards[data-tax=news] .card__data",
"fields": [
{
"name": "title",
"selector": "h4.card__title",
"type": "text"
},
{
"name": "link",
"selector": "h4.card__title a",
"type": "attribute",
"attribute": "href"
},
{
"name": "details",
"selector": ".card__details",
"type": "text"
},
{
"name": "topics",
"selector": ".card__topics.topics",
"type": "text"
}
]
}

View File

@@ -0,0 +1,11 @@
type: "llm"
provider: "openai/gpt-4o-mini"
api_token: "env:OPENAI_API_KEY"
instruction: "Extract all articles with their titles, authors, publication dates and main topics in a structured format"
params:
chunk_token_threshold: 4096
overlap_rate: 0.1
word_token_rate: 0.75
temperature: 0.3
max_tokens: 1000
verbose: true

View File

@@ -0,0 +1,3 @@
type: "json-css"
params:
verbose: true

View File

@@ -0,0 +1,26 @@
{
"title": "NewsArticle",
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "The title/headline of the news article"
},
"link": {
"type": "string",
"description": "The URL or link to the full article"
},
"details": {
"type": "string",
"description": "Brief summary or details about the article content"
},
"topics": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of topics or categories associated with the article"
}
},
"required": ["title", "details"]
}

docs/md_v2/core/cli.md (new file, 304 lines)
View File

@@ -0,0 +1,304 @@
# Crawl4AI CLI Guide
## Table of Contents
- [Installation](#installation)
- [Basic Usage](#basic-usage)
- [Configuration](#configuration)
- [Browser Configuration](#browser-configuration)
- [Crawler Configuration](#crawler-configuration)
- [Extraction Configuration](#extraction-configuration)
- [Content Filtering](#content-filtering)
- [Advanced Features](#advanced-features)
- [LLM Q&A](#llm-qa)
- [Structured Data Extraction](#structured-data-extraction)
- [Content Filtering](#content-filtering-1)
- [Output Formats](#output-formats)
- [Examples](#examples)
- [Configuration Reference](#configuration-reference)
- [Best Practices & Tips](#best-practices--tips)
## Basic Usage
The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
```bash
# Basic crawling
crwl https://example.com
# Get markdown output
crwl https://example.com -o markdown
# Verbose JSON output with cache bypass
crwl https://example.com -o json -v --bypass-cache
# See usage examples
crwl --example
```
## Quick Example of Advanced Usage
If you clone the repository and run the following command, you will receive the content of the page in JSON format according to a JSON-CSS schema:
```bash
crwl "https://www.infoq.com/ai-ml-data-eng/" -e docs/examples/cli/extract_css.yml -s docs/examples/cli/css_schema.json -o json;
```
## Configuration
### Browser Configuration
Browser settings can be configured via YAML file or command line parameters:
```yaml
# browser.yml
headless: true
viewport_width: 1280
user_agent_mode: "random"
verbose: true
ignore_https_errors: true
```
```bash
# Using config file
crwl https://example.com -B browser.yml
# Using direct parameters
crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
```
### Crawler Configuration
Control crawling behavior:
```yaml
# crawler.yml
cache_mode: "bypass"
wait_until: "networkidle"
page_timeout: 30000
delay_before_return_html: 0.5
word_count_threshold: 100
scan_full_page: true
scroll_delay: 0.3
process_iframes: false
remove_overlay_elements: true
magic: true
verbose: true
```
```bash
# Using config file
crwl https://example.com -C crawler.yml
# Using direct parameters
crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
```
### Extraction Configuration
Two types of extraction are supported:
1. CSS/XPath-based extraction:
```yaml
# extract_css.yml
type: "json-css"
params:
verbose: true
```
```json
// css_schema.json
{
"name": "ArticleExtractor",
"baseSelector": ".article",
"fields": [
{
"name": "title",
"selector": "h1.title",
"type": "text"
},
{
"name": "link",
"selector": "a.read-more",
"type": "attribute",
"attribute": "href"
}
]
}
```
2. LLM-based extraction:
```yaml
# extract_llm.yml
type: "llm"
provider: "openai/gpt-4"
instruction: "Extract all articles with their titles and links"
api_token: "your-token"
params:
temperature: 0.3
max_tokens: 1000
```
```json
// llm_schema.json
{
"title": "Article",
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "The title of the article"
},
"link": {
"type": "string",
"description": "URL to the full article"
}
}
}
```
## Advanced Features
### LLM Q&A
Ask questions about crawled content:
```bash
# Simple question
crwl https://example.com -q "What is the main topic discussed?"
# View content then ask questions
crwl https://example.com -o markdown # See content first
crwl https://example.com -q "Summarize the key points"
crwl https://example.com -q "What are the conclusions?"
# Combined with advanced crawling
crwl https://example.com \
-B browser.yml \
-c "css_selector=article,scan_full_page=true" \
-q "What are the pros and cons mentioned?"
```
First-time setup:
- Prompts for LLM provider and API token
- Saves configuration in `~/.crawl4ai/global.yml`
- Supports various providers (openai/gpt-4, anthropic/claude-3-sonnet, etc.)
- For `ollama` providers you do not need to provide an API token.
- See [LiteLLM Providers](https://docs.litellm.ai/docs/providers) for full list
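After the first run the saved file looks roughly like this (the keys come from the CLI source; the values are placeholders):

```yaml
# ~/.crawl4ai/global.yml
DEFAULT_LLM_PROVIDER: "openai/gpt-4o"
DEFAULT_LLM_PROVIDER_TOKEN: "your-api-token"
```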
### Structured Data Extraction
Extract structured data using CSS selectors:
```bash
crwl https://example.com \
-e extract_css.yml \
-s css_schema.json \
-o json
```
Or using LLM-based extraction:
```bash
crwl https://example.com \
-e extract_llm.yml \
-s llm_schema.json \
-o json
```
### Content Filtering
Filter content for relevance:
```yaml
# filter_bm25.yml
type: "bm25"
query: "target content"
threshold: 1.0
# filter_pruning.yml
type: "pruning"
query: "focus topic"
threshold: 0.48
```
```bash
crwl https://example.com -f filter_bm25.yml -o markdown-fit
```
## Output Formats
- `all` - Full crawl result including metadata
- `json` - Extracted structured data (when using extraction)
- `markdown` / `md` - Raw markdown output
- `markdown-fit` / `md-fit` - Filtered markdown for better readability
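For example, the markdown output can be redirected straight to a file for later review:

```bash
crwl https://example.com -o markdown > page.md
```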
## Complete Examples
1. Basic Extraction:
```bash
crwl https://example.com \
-B browser.yml \
-C crawler.yml \
-o json
```
2. Structured Data Extraction:
```bash
crwl https://example.com \
-e extract_css.yml \
-s css_schema.json \
-o json \
-v
```
3. LLM Extraction with Filtering:
```bash
crwl https://example.com \
-B browser.yml \
-e extract_llm.yml \
-s llm_schema.json \
-f filter_bm25.yml \
-o json
```
4. Interactive Q&A:
```bash
# First crawl and view
crwl https://example.com -o markdown
# Then ask questions
crwl https://example.com -q "What are the main points?"
crwl https://example.com -q "Summarize the conclusions"
```
## Best Practices & Tips
1. **Configuration Management**:
- Keep common configurations in YAML files
- Use CLI parameters for quick overrides
- Store sensitive data (API tokens) in `~/.crawl4ai/global.yml`
2. **Performance Optimization**:
- Use `--bypass-cache` for fresh content
- Enable `scan_full_page` for infinite scroll pages
- Adjust `delay_before_return_html` for dynamic content
3. **Content Extraction**:
- Use CSS extraction for structured content
- Use LLM extraction for unstructured content
- Combine with filters for focused results
4. **Q&A Workflow**:
- View content first with `-o markdown`
- Ask specific questions
- Use broader context with appropriate selectors
## Recap
The Crawl4AI CLI provides:
- Flexible configuration via files and parameters
- Multiple extraction strategies (CSS, XPath, LLM)
- Content filtering and optimization
- Interactive Q&A capabilities
- Various output formats

View File

@@ -15,6 +15,7 @@ nav:
- "Blog Home": "blog/index.md"
- "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
- Core:
- "Command Line Interface": "core/cli.md"
- "Simple Crawling": "core/simple-crawling.md"
- "Crawler Result": "core/crawler-result.md"
- "Browser & Crawler Config": "core/browser-crawler-config.md"

tests/cli/test_cli.py (new file, 133 lines)
View File

@@ -0,0 +1,133 @@
import pytest
from click.testing import CliRunner
from pathlib import Path
import json
import yaml
from crawl4ai.cli import cli, load_config_file, parse_key_values
import tempfile
import os
import click
@pytest.fixture
def runner():
return CliRunner()
@pytest.fixture
def temp_config_dir():
with tempfile.TemporaryDirectory() as tmpdir:
old_home = os.environ.get('HOME')
os.environ['HOME'] = tmpdir
yield Path(tmpdir)
if old_home:
os.environ['HOME'] = old_home
@pytest.fixture
def sample_configs(temp_config_dir):
configs = {
'browser.yml': {
'headless': True,
'viewport_width': 1280,
'user_agent_mode': 'random'
},
'crawler.yml': {
'cache_mode': 'bypass',
'wait_until': 'networkidle',
'scan_full_page': True
},
'extract_css.yml': {
'type': 'json-css',
'params': {'verbose': True}
},
'css_schema.json': {
'name': 'ArticleExtractor',
'baseSelector': '.article',
'fields': [
{'name': 'title', 'selector': 'h1.title', 'type': 'text'},
{'name': 'link', 'selector': 'a.read-more', 'type': 'attribute', 'attribute': 'href'}
]
}
}
for filename, content in configs.items():
path = temp_config_dir / filename
with open(path, 'w') as f:
if filename.endswith('.yml'):
yaml.dump(content, f)
else:
json.dump(content, f)
return {name: str(temp_config_dir / name) for name in configs}
class TestCLIBasics:
def test_help(self, runner):
result = runner.invoke(cli, ['--help'])
assert result.exit_code == 0
assert 'Crawl4AI CLI' in result.output
def test_examples(self, runner):
result = runner.invoke(cli, ['--example'])
assert result.exit_code == 0
assert 'Examples' in result.output
def test_missing_url(self, runner):
result = runner.invoke(cli)
assert result.exit_code != 0
assert 'URL argument is required' in result.output
class TestConfigParsing:
def test_parse_key_values_basic(self):
result = parse_key_values(None, None, "key1=value1,key2=true")
assert result == {'key1': 'value1', 'key2': True}
def test_parse_key_values_invalid(self):
with pytest.raises(click.BadParameter):
parse_key_values(None, None, "invalid_format")
class TestConfigLoading:
def test_load_yaml_config(self, sample_configs):
config = load_config_file(sample_configs['browser.yml'])
assert config['headless'] is True
assert config['viewport_width'] == 1280
def test_load_json_config(self, sample_configs):
config = load_config_file(sample_configs['css_schema.json'])
assert config['name'] == 'ArticleExtractor'
assert len(config['fields']) == 2
def test_load_nonexistent_config(self):
with pytest.raises(click.BadParameter):
load_config_file('nonexistent.yml')
class TestLLMConfig:
def test_llm_config_creation(self, temp_config_dir, runner):
def input_simulation(inputs):
return runner.invoke(cli, ['https://example.com', '-q', 'test question'],
input='\n'.join(inputs))
class TestCrawlingFeatures:
def test_basic_crawl(self, runner):
result = runner.invoke(cli, ['https://example.com'])
assert result.exit_code == 0
class TestErrorHandling:
def test_invalid_config_file(self, runner):
result = runner.invoke(cli, [
'https://example.com',
'--browser-config', 'nonexistent.yml'
])
assert result.exit_code != 0
def test_invalid_schema(self, runner, temp_config_dir):
invalid_schema = temp_config_dir / 'invalid_schema.json'
with open(invalid_schema, 'w') as f:
f.write('invalid json')
result = runner.invoke(cli, [
'https://example.com',
'--schema', str(invalid_schema)
])
assert result.exit_code != 0
if __name__ == '__main__':
pytest.main(['-v', '-s', '--tb=native', __file__])