feat(cli): add command line interface with comprehensive features
Implements a full-featured CLI for Crawl4AI with the following capabilities:
- Basic and advanced web crawling
- Configuration management via YAML/JSON files
- Multiple extraction strategies (CSS, XPath, LLM)
- Content filtering and optimization
- Interactive Q&A capabilities
- Various output formats
- Comprehensive documentation and examples

Also includes:
- Home directory setup for configuration and cache
- Environment variable support for API tokens
- Test suite for CLI functionality
This commit is contained in:
parent 467be9ac76
commit 91a5fea11f
@@ -361,7 +361,8 @@ class BrowserConfig():
     @staticmethod
     def load( data: dict) -> "BrowserConfig":
         # Deserialize the object from a dictionary
-        return from_serializable_dict(data) if data else BrowserConfig()
+        config = from_serializable_dict(data)
+        return BrowserConfig.from_kwargs(config)


 class CrawlerRunConfig():

@@ -811,7 +812,8 @@ class CrawlerRunConfig():
     @staticmethod
     def load(data: dict) -> "CrawlerRunConfig":
         # Deserialize the object from a dictionary
-        return from_serializable_dict(data) if data else CrawlerRunConfig()
+        config = from_serializable_dict(data)
+        return CrawlerRunConfig.from_kwargs(config)

     def to_dict(self):
         return {
crawl4ai/cli.py (new file, 404 lines)
@@ -0,0 +1,404 @@
|
||||
import click
|
||||
import os
|
||||
from typing import Dict, Any, Optional
|
||||
import json
|
||||
import yaml
|
||||
import anyio
|
||||
from crawl4ai import (
|
||||
CacheMode,
|
||||
AsyncWebCrawler,
|
||||
CrawlResult,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
LLMExtractionStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
JsonXPathExtractionStrategy,
|
||||
BM25ContentFilter,
|
||||
PruningContentFilter
|
||||
)
|
||||
from litellm import completion
|
||||
from pathlib import Path
|
||||
|
||||
def get_global_config() -> dict:
|
||||
config_dir = Path.home() / ".crawl4ai"
|
||||
config_file = config_dir / "global.yml"
|
||||
|
||||
if not config_file.exists():
|
||||
config_dir.mkdir(parents=True, exist_ok=True)
|
||||
return {}
|
||||
|
||||
with open(config_file) as f:
|
||||
return yaml.safe_load(f) or {}
|
||||
|
||||
def save_global_config(config: dict):
|
||||
config_file = Path.home() / ".crawl4ai" / "global.yml"
|
||||
with open(config_file, "w") as f:
|
||||
yaml.dump(config, f)
|
||||
|
||||
def setup_llm_config() -> tuple[str, str]:
|
||||
config = get_global_config()
|
||||
provider = config.get("DEFAULT_LLM_PROVIDER")
|
||||
token = config.get("DEFAULT_LLM_PROVIDER_TOKEN")
|
||||
|
||||
if not provider:
|
||||
click.echo("\nNo default LLM provider configured.")
|
||||
click.echo("Provider format: 'company/model' (e.g., 'openai/gpt-4o', 'anthropic/claude-3-sonnet')")
|
||||
click.echo("See available providers at: https://docs.litellm.ai/docs/providers")
|
||||
provider = click.prompt("Enter provider")
|
||||
|
||||
if not provider.startswith("ollama/"):
|
||||
if not token:
|
||||
token = click.prompt("Enter API token for " + provider, hide_input=True)
|
||||
else:
|
||||
token = "no-token"
|
||||
|
||||
if not config.get("DEFAULT_LLM_PROVIDER") or not config.get("DEFAULT_LLM_PROVIDER_TOKEN"):
|
||||
config["DEFAULT_LLM_PROVIDER"] = provider
|
||||
config["DEFAULT_LLM_PROVIDER_TOKEN"] = token
|
||||
save_global_config(config)
|
||||
click.echo("\nConfiguration saved to ~/.crawl4ai/global.yml")
|
||||
|
||||
return provider, token
|
||||
|
||||
async def stream_llm_response(url: str, markdown: str, query: str, provider: str, token: str):
|
||||
response = completion(
|
||||
model=provider,
|
||||
api_key=token,
|
||||
messages=[
|
||||
{
|
||||
"content": f"You are Crawl4ai assistant, answering user question based on the provided context which is crawled from {url}.",
|
||||
"role": "system"
|
||||
},
|
||||
{
|
||||
"content": f"<|start of context|>\n{markdown}\n<|end of context|>\n\n{query}",
|
||||
"role": "user"
|
||||
},
|
||||
],
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
if content := chunk["choices"][0]["delta"].get("content"):
|
||||
print(content, end="", flush=True)
|
||||
print() # New line at end
|
||||
|
||||
|
||||
|
||||
def parse_key_values(ctx, param, value) -> Dict[str, Any]:
|
||||
if not value:
|
||||
return {}
|
||||
result = {}
|
||||
pairs = value.split(',')
|
||||
for pair in pairs:
|
||||
try:
|
||||
k, v = pair.split('=', 1)
|
||||
# Handle common value types
|
||||
if v.lower() == 'true': v = True
|
||||
elif v.lower() == 'false': v = False
|
||||
elif v.isdigit(): v = int(v)
|
||||
elif v.replace('.','',1).isdigit(): v = float(v)
|
||||
elif v.startswith('[') and v.endswith(']'):
|
||||
v = [x.strip() for x in v[1:-1].split(',') if x.strip()]
|
||||
elif v.startswith('{') and v.endswith('}'):
|
||||
try:
|
||||
v = json.loads(v)
|
||||
except json.JSONDecodeError:
|
||||
raise click.BadParameter(f'Invalid JSON object: {v}')
|
||||
result[k.strip()] = v
|
||||
except ValueError:
|
||||
raise click.BadParameter(f'Invalid key=value pair: {pair}')
|
||||
return result
|
||||
|
||||
def load_config_file(path: Optional[str]) -> dict:
|
||||
if not path:
|
||||
return {}
|
||||
|
||||
try:
|
||||
with open(path) as f:
|
||||
if path.endswith((".yaml", ".yml")):
|
||||
return yaml.safe_load(f)
|
||||
return json.load(f)
|
||||
except Exception as e:
|
||||
raise click.BadParameter(f'Error loading config file {path}: {str(e)}')
|
||||
|
||||
def load_schema_file(path: Optional[str]) -> dict:
|
||||
if not path:
|
||||
return None
|
||||
return load_config_file(path)
|
||||
|
||||
async def run_crawler(url: str, browser_cfg: BrowserConfig, crawler_cfg: CrawlerRunConfig, verbose: bool):
|
||||
if verbose:
|
||||
click.echo("Starting crawler with configurations:")
|
||||
click.echo(f"Browser config: {browser_cfg.dump()}")
|
||||
click.echo(f"Crawler config: {crawler_cfg.dump()}")
|
||||
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
try:
|
||||
result = await crawler.arun(url=url, config=crawler_cfg)
|
||||
return result
|
||||
except Exception as e:
|
||||
raise click.ClickException(f"Crawling failed: {str(e)}")
|
||||
|
||||
def show_examples():
|
||||
examples = """
|
||||
🚀 Crawl4AI CLI Examples
|
||||
|
||||
1️⃣ Basic Usage:
|
||||
# Simple crawl with default settings
|
||||
crwl https://example.com
|
||||
|
||||
# Get markdown output
|
||||
crwl https://example.com -o markdown
|
||||
|
||||
# Verbose JSON output with cache bypass
|
||||
crwl https://example.com -o json -v --bypass-cache
|
||||
|
||||
2️⃣ Using Config Files:
|
||||
# Using browser and crawler configs
|
||||
crwl https://example.com -B browser.yml -C crawler.yml
|
||||
|
||||
# CSS-based extraction
|
||||
crwl https://example.com -e extract_css.yml -s css_schema.json -o json
|
||||
|
||||
# LLM-based extraction
|
||||
crwl https://example.com -e extract_llm.yml -s llm_schema.json -o json
|
||||
|
||||
3️⃣ Direct Parameters:
|
||||
# Browser settings
|
||||
crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
|
||||
|
||||
# Crawler settings
|
||||
crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
|
||||
|
||||
4️⃣ Sample Config Files:
|
||||
|
||||
browser.yml:
|
||||
headless: true
|
||||
viewport_width: 1280
|
||||
user_agent_mode: "random"
|
||||
verbose: true
|
||||
ignore_https_errors: true
|
||||
|
||||
extract_css.yml:
|
||||
type: "json-css"
|
||||
params:
|
||||
verbose: true
|
||||
|
||||
css_schema.json:
|
||||
{
|
||||
"name": "ArticleExtractor",
|
||||
"baseSelector": ".article",
|
||||
"fields": [
|
||||
{
|
||||
"name": "title",
|
||||
"selector": "h1.title",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "link",
|
||||
"selector": "a.read-more",
|
||||
"type": "attribute",
|
||||
"attribute": "href"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
extract_llm.yml:
|
||||
type: "llm"
|
||||
provider: "openai/gpt-4"
|
||||
instruction: "Extract all articles with their titles and links"
|
||||
api_token: "your-token"
|
||||
params:
|
||||
temperature: 0.3
|
||||
max_tokens: 1000
|
||||
|
||||
llm_schema.json:
|
||||
{
|
||||
"title": "Article",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string",
|
||||
"description": "The title of the article"
|
||||
},
|
||||
"link": {
|
||||
"type": "string",
|
||||
"description": "URL to the full article"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
5️⃣ Advanced Usage:
|
||||
# Combine configs with direct parameters
|
||||
crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
|
||||
|
||||
# Full extraction pipeline
|
||||
crwl https://example.com \\
|
||||
-B browser.yml \\
|
||||
-C crawler.yml \\
|
||||
-e extract_llm.yml \\
|
||||
-s llm_schema.json \\
|
||||
-o json \\
|
||||
-v
|
||||
|
||||
# Content filtering with BM25
|
||||
crwl https://example.com \\
|
||||
-f filter_bm25.yml \\
|
||||
-o markdown-fit
|
||||
|
||||
For more documentation visit: https://github.com/unclecode/crawl4ai
|
||||
|
||||
6️⃣ Q&A with LLM:
|
||||
# Ask a question about the content
|
||||
crwl https://example.com -q "What is the main topic discussed?"
|
||||
|
||||
# First view content, then ask questions
|
||||
crwl https://example.com -o markdown # See the crawled content first
|
||||
crwl https://example.com -q "Summarize the key points"
|
||||
crwl https://example.com -q "What are the conclusions?"
|
||||
|
||||
# Advanced crawling with Q&A
|
||||
crwl https://example.com \\
|
||||
-B browser.yml \\
|
||||
-c "css_selector=article,scan_full_page=true" \\
|
||||
-q "What are the pros and cons mentioned?"
|
||||
|
||||
Note: First time using -q will prompt for LLM provider and API token.
|
||||
These will be saved in ~/.crawl4ai/global.yml for future use.
|
||||
|
||||
Supported provider format: 'company/model'
|
||||
Examples:
|
||||
- ollama/llama3.3
|
||||
- openai/gpt-4
|
||||
- anthropic/claude-3-sonnet
|
||||
- cohere/command
|
||||
- google/gemini-pro
|
||||
|
||||
See full list of providers: https://docs.litellm.ai/docs/providers
|
||||
"""
|
||||
click.echo(examples)
|
||||
|
||||
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
|
||||
@click.argument("url", required=False)
|
||||
@click.option("--example", is_flag=True, help="Show usage examples")
|
||||
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
|
||||
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
|
||||
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
|
||||
@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
|
||||
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
|
||||
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
|
||||
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
|
||||
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "markdown-v2", "md", "md-fit"]), default="all")
|
||||
@click.option("--bypass-cache", is_flag=True, default = True, help="Bypass cache when crawling")
|
||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||
@click.option("--verbose", "-v", is_flag=True)
|
||||
def cli(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
|
||||
extraction_config: str, schema: str, browser: Dict, crawler: Dict,
|
||||
output: str, bypass_cache: bool, question: str, verbose: bool):
|
||||
"""Crawl4AI CLI - Web content extraction tool
|
||||
|
||||
Simple Usage:
|
||||
crwl https://example.com
|
||||
|
||||
Run with --example to see detailed usage examples."""
|
||||
|
||||
if example:
|
||||
show_examples()
|
||||
return
|
||||
|
||||
if not url:
|
||||
raise click.UsageError("URL argument is required unless using --example")
|
||||
|
||||
try:
|
||||
# Load base configurations
|
||||
browser_cfg = BrowserConfig.load(load_config_file(browser_config))
|
||||
crawler_cfg = CrawlerRunConfig.load(load_config_file(crawler_config))
|
||||
|
||||
# Override with CLI params
|
||||
if browser:
|
||||
browser_cfg = browser_cfg.clone(**browser)
|
||||
if crawler:
|
||||
crawler_cfg = crawler_cfg.clone(**crawler)
|
||||
|
||||
# Handle content filter config
|
||||
if filter_config:
|
||||
filter_conf = load_config_file(filter_config)
|
||||
if filter_conf["type"] == "bm25":
|
||||
crawler_cfg.content_filter = BM25ContentFilter(
|
||||
user_query=filter_conf.get("query"),
|
||||
bm25_threshold=filter_conf.get("threshold", 1.0)
|
||||
)
|
||||
elif filter_conf["type"] == "pruning":
|
||||
crawler_cfg.content_filter = PruningContentFilter(
|
||||
user_query=filter_conf.get("query"),
|
||||
threshold=filter_conf.get("threshold", 0.48)
|
||||
)
|
||||
|
||||
# Handle extraction strategy
|
||||
if extraction_config:
|
||||
extract_conf = load_config_file(extraction_config)
|
||||
schema_data = load_schema_file(schema)
|
||||
|
||||
# If the extraction type is missing, show a proper error message
|
||||
if not extract_conf.get("type"):
|
||||
raise click.ClickException("Extraction type not specified")
|
||||
if extract_conf["type"] not in ["llm", "json-css", "json-xpath"]:
|
||||
raise click.ClickException(f"Invalid extraction type: {extract_conf['type']}")
|
||||
|
||||
if extract_conf["type"] == "llm":
|
||||
# If no provider or API token is given, show an error message
|
||||
if not extract_conf.get("provider") or not extract_conf.get("api_token"):
|
||||
raise click.ClickException("LLM provider and API token are required for LLM extraction")
|
||||
|
||||
crawler_cfg.extraction_strategy = LLMExtractionStrategy(
|
||||
provider=extract_conf["provider"],
|
||||
instruction=extract_conf["instruction"],
|
||||
api_token=extract_conf.get("api_token", extract_conf.get("api_key")),
|
||||
schema=schema_data,
|
||||
**extract_conf.get("params", {})
|
||||
)
|
||||
elif extract_conf["type"] == "json-css":
|
||||
crawler_cfg.extraction_strategy = JsonCssExtractionStrategy(
|
||||
schema=schema_data
|
||||
)
|
||||
elif extract_conf["type"] == "json-xpath":
|
||||
crawler_cfg.extraction_strategy = JsonXPathExtractionStrategy(
|
||||
schema=schema_data
|
||||
)
|
||||
|
||||
|
||||
# No cache
|
||||
if bypass_cache:
|
||||
crawler_cfg.cache_mode = CacheMode.BYPASS
|
||||
|
||||
# Run crawler
|
||||
result: CrawlResult = anyio.run(
|
||||
run_crawler,
|
||||
url,
|
||||
browser_cfg,
|
||||
crawler_cfg,
|
||||
verbose
|
||||
)
|
||||
|
||||
# Handle question
|
||||
if question:
|
||||
provider, token = setup_llm_config()
|
||||
markdown = result.markdown_v2.raw_markdown
|
||||
anyio.run(stream_llm_response, url, markdown, question, provider, token)
|
||||
return
|
||||
|
||||
# Handle output
|
||||
if output == "all":
|
||||
click.echo(json.dumps(result.model_dump(), indent=2))
|
||||
elif output == "json":
|
||||
click.echo(json.dumps(json.loads(result.extracted_content), indent=2))
|
||||
elif output in ["markdown", "md"]:
|
||||
click.echo(result.markdown_v2.raw_markdown)
|
||||
elif output in ["markdown-fit", "md-fit"]:
|
||||
click.echo(result.markdown_v2.fit_markdown)
|
||||
|
||||
except Exception as e:
|
||||
raise click.ClickException(str(e))
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
@@ -15,6 +15,8 @@ PROVIDER_MODELS = {
     "openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
     "openai/o1-mini": os.getenv("OPENAI_API_KEY"),
     "openai/o1-preview": os.getenv("OPENAI_API_KEY"),
+    "openai/o3-mini": os.getenv("OPENAI_API_KEY"),
+    "openai/o3-mini-high": os.getenv("OPENAI_API_KEY"),
     "anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
     "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
     "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),

@@ -533,11 +533,15 @@ class LLMExtractionStrategy(ExtractionStrategy):
         """
         super().__init__(**kwargs)
         self.provider = provider
-        self.api_token = (
-            api_token
-            or PROVIDER_MODELS.get(provider, "no-token")
-            or os.getenv("OPENAI_API_KEY")
-        )
+        if api_token and not api_token.startswith("env:"):
+            self.api_token = api_token
+        elif api_token and api_token.startswith("env:"):
+            self.api_token = os.getenv(api_token[4:])
+        else:
+            self.api_token = (
+                PROVIDER_MODELS.get(provider, "no-token")
+                or os.getenv("OPENAI_API_KEY")
+            )
         self.instruction = instruction
         self.extract_type = extraction_type
         self.schema = schema
@@ -2,14 +2,47 @@ import subprocess
import sys
import asyncio
from .async_logger import AsyncLogger, LogLevel
from pathlib import Path
import os
import shutil

# Initialize logger
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)

def setup_home_directory():
    """Set up the .crawl4ai folder structure in the user's home directory."""
    base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
    crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
    crawl4ai_config = crawl4ai_folder / "global.yml"
    crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
    cache_folder = crawl4ai_folder / "cache"
    content_folders = [
        "html_content",
        "cleaned_html",
        "markdown_content",
        "extracted_content",
        "screenshots",
    ]

    # Clean up old cache if exists
    if cache_folder.exists():
        shutil.rmtree(cache_folder)

    # Create new folder structure
    crawl4ai_folder.mkdir(exist_ok=True)
    cache_folder.mkdir(exist_ok=True)
    for folder in content_folders:
        (crawl4ai_folder / folder).mkdir(exist_ok=True)

    # If config file does not exist, create it
    if not crawl4ai_config.exists():
        with open(crawl4ai_config, "w") as f:
            f.write("")

def post_install():
    """Run all post-installation tasks"""
    logger.info("Running post-installation setup...", tag="INIT")
    setup_home_directory()
    install_playwright()
    run_migration()
    logger.success("Post-installation setup completed!", tag="COMPLETE")
docs/examples/cli/browser.yml (new file, 13 lines)
@@ -0,0 +1,13 @@
browser_type: "chromium"
headless: true
viewport_width: 1280
viewport_height: 800
user_agent_mode: "random"
verbose: true
text_mode: false
light_mode: false
ignore_https_errors: true
java_script_enabled: true
extra_args:
  - "--disable-gpu"
  - "--no-sandbox"
docs/examples/cli/crawler.yml (new file, 13 lines)
@@ -0,0 +1,13 @@
cache_mode: "bypass"
wait_until: "networkidle"
page_timeout: 30000
delay_before_return_html: 0.5
word_count_threshold: 100
scan_full_page: true
scroll_delay: 0.3
process_iframes: false
remove_overlay_elements: true
magic: true
verbose: true
exclude_external_links: true
exclude_social_media_links: true
docs/examples/cli/css_schema.json (new file, 27 lines)
@@ -0,0 +1,27 @@
{
  "name": "ArticleExtractor",
  "baseSelector": ".cards[data-tax=news] .card__data",
  "fields": [
    {
      "name": "title",
      "selector": "h4.card__title",
      "type": "text"
    },
    {
      "name": "link",
      "selector": "h4.card__title a",
      "type": "attribute",
      "attribute": "href"
    },
    {
      "name": "details",
      "selector": ".card__details",
      "type": "text"
    },
    {
      "name": "topics",
      "selector": ".card__topics.topics",
      "type": "text"
    }
  ]
}
docs/examples/cli/extract.yml (new file, 11 lines)
@@ -0,0 +1,11 @@
type: "llm"
provider: "openai/gpt-4o-mini"
api_token: "env:OPENAI_API_KEY"
instruction: "Extract all articles with their titles, authors, publication dates and main topics in a structured format"
params:
  chunk_token_threshold: 4096
  overlap_rate: 0.1
  word_token_rate: 0.75
  temperature: 0.3
  max_tokens: 1000
  verbose: true
docs/examples/cli/extract_css.yml (new file, 3 lines)
@@ -0,0 +1,3 @@
type: "json-css"
params:
  verbose: true
docs/examples/cli/llm_schema.json (new file, 26 lines)
@@ -0,0 +1,26 @@
{
  "title": "NewsArticle",
  "type": "object",
  "properties": {
    "title": {
      "type": "string",
      "description": "The title/headline of the news article"
    },
    "link": {
      "type": "string",
      "description": "The URL or link to the full article"
    },
    "details": {
      "type": "string",
      "description": "Brief summary or details about the article content"
    },
    "topics": {
      "type": "array",
      "items": {
        "type": "string"
      },
      "description": "List of topics or categories associated with the article"
    }
  },
  "required": ["title", "details"]
}
docs/md_v2/core/cli.md (new file, 304 lines)
@@ -0,0 +1,304 @@
|
||||
# Crawl4AI CLI Guide
|
||||
|
||||
## Table of Contents
|
||||
- [Installation](#installation)
|
||||
- [Basic Usage](#basic-usage)
|
||||
- [Configuration](#configuration)
|
||||
- [Browser Configuration](#browser-configuration)
|
||||
- [Crawler Configuration](#crawler-configuration)
|
||||
- [Extraction Configuration](#extraction-configuration)
|
||||
- [Content Filtering](#content-filtering)
|
||||
- [Advanced Features](#advanced-features)
|
||||
- [LLM Q&A](#llm-qa)
|
||||
- [Structured Data Extraction](#structured-data-extraction)
|
||||
- [Content Filtering](#content-filtering-1)
|
||||
- [Output Formats](#output-formats)
|
||||
- [Examples](#examples)
|
||||
- [Configuration Reference](#configuration-reference)
|
||||
- [Best Practices & Tips](#best-practices--tips)
|
||||
|
||||
## Basic Usage
|
||||
|
||||
The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
|
||||
|
||||
```bash
|
||||
# Basic crawling
|
||||
crwl https://example.com
|
||||
|
||||
# Get markdown output
|
||||
crwl https://example.com -o markdown
|
||||
|
||||
# Verbose JSON output with cache bypass
|
||||
crwl https://example.com -o json -v --bypass-cache
|
||||
|
||||
# See usage examples
|
||||
crwl --example
|
||||
```
|
||||
|
||||
## Quick Example of Advanced Usage
|
||||
|
||||
If you clone the repository and run the following command, you will get the page content as JSON, extracted according to a JSON-CSS schema:
|
||||
|
||||
```bash
|
||||
crwl "https://www.infoq.com/ai-ml-data-eng/" -e docs/examples/cli/extract_css.yml -s docs/examples/cli/css_schema.json -o json;
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Browser Configuration
|
||||
|
||||
Browser settings can be configured via YAML file or command line parameters:
|
||||
|
||||
```yaml
|
||||
# browser.yml
|
||||
headless: true
|
||||
viewport_width: 1280
|
||||
user_agent_mode: "random"
|
||||
verbose: true
|
||||
ignore_https_errors: true
|
||||
```
|
||||
|
||||
```bash
|
||||
# Using config file
|
||||
crwl https://example.com -B browser.yml
|
||||
|
||||
# Using direct parameters
|
||||
crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
|
||||
```
|
||||
|
||||
### Crawler Configuration
|
||||
|
||||
Control crawling behavior:
|
||||
|
||||
```yaml
|
||||
# crawler.yml
|
||||
cache_mode: "bypass"
|
||||
wait_until: "networkidle"
|
||||
page_timeout: 30000
|
||||
delay_before_return_html: 0.5
|
||||
word_count_threshold: 100
|
||||
scan_full_page: true
|
||||
scroll_delay: 0.3
|
||||
process_iframes: false
|
||||
remove_overlay_elements: true
|
||||
magic: true
|
||||
verbose: true
|
||||
```
|
||||
|
||||
```bash
|
||||
# Using config file
|
||||
crwl https://example.com -C crawler.yml
|
||||
|
||||
# Using direct parameters
|
||||
crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
|
||||
```
|
||||
|
||||
### Extraction Configuration
|
||||
|
||||
Two types of extraction are supported:
|
||||
|
||||
1. CSS/XPath-based extraction:
|
||||
```yaml
|
||||
# extract_css.yml
|
||||
type: "json-css"
|
||||
params:
|
||||
verbose: true
|
||||
```
|
||||
|
||||
```json
|
||||
// css_schema.json
|
||||
{
|
||||
"name": "ArticleExtractor",
|
||||
"baseSelector": ".article",
|
||||
"fields": [
|
||||
{
|
||||
"name": "title",
|
||||
"selector": "h1.title",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "link",
|
||||
"selector": "a.read-more",
|
||||
"type": "attribute",
|
||||
"attribute": "href"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
2. LLM-based extraction:
|
||||
```yaml
|
||||
# extract_llm.yml
|
||||
type: "llm"
|
||||
provider: "openai/gpt-4"
|
||||
instruction: "Extract all articles with their titles and links"
|
||||
api_token: "your-token"
|
||||
params:
|
||||
temperature: 0.3
|
||||
max_tokens: 1000
|
||||
```
|
||||
|
||||
```json
|
||||
// llm_schema.json
|
||||
{
|
||||
"title": "Article",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string",
|
||||
"description": "The title of the article"
|
||||
},
|
||||
"link": {
|
||||
"type": "string",
|
||||
"description": "URL to the full article"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### LLM Q&A
|
||||
|
||||
Ask questions about crawled content:
|
||||
|
||||
```bash
|
||||
# Simple question
|
||||
crwl https://example.com -q "What is the main topic discussed?"
|
||||
|
||||
# View content then ask questions
|
||||
crwl https://example.com -o markdown # See content first
|
||||
crwl https://example.com -q "Summarize the key points"
|
||||
crwl https://example.com -q "What are the conclusions?"
|
||||
|
||||
# Combined with advanced crawling
|
||||
crwl https://example.com \
|
||||
-B browser.yml \
|
||||
-c "css_selector=article,scan_full_page=true" \
|
||||
-q "What are the pros and cons mentioned?"
|
||||
```
|
||||
|
||||
First-time setup:
- Prompts for LLM provider and API token
- Saves configuration in `~/.crawl4ai/global.yml`
- Supports various providers (openai/gpt-4, anthropic/claude-3-sonnet, etc.)
- For `ollama` providers, no API token is required
- See [LiteLLM Providers](https://docs.litellm.ai/docs/providers) for the full list
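
Based on the keys the CLI's `setup_llm_config` helper writes (see `crawl4ai/cli.py` in this commit), the saved file looks roughly like this, with placeholder values:

```yaml
# ~/.crawl4ai/global.yml (illustrative values)
DEFAULT_LLM_PROVIDER: "openai/gpt-4o"
DEFAULT_LLM_PROVIDER_TOKEN: "your-api-token"
```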
|
||||
|
||||
### Structured Data Extraction
|
||||
|
||||
Extract structured data using CSS selectors:
|
||||
|
||||
```bash
|
||||
crwl https://example.com \
|
||||
-e extract_css.yml \
|
||||
-s css_schema.json \
|
||||
-o json
|
||||
```
|
||||
|
||||
Or using LLM-based extraction:
|
||||
|
||||
```bash
|
||||
crwl https://example.com \
|
||||
-e extract_llm.yml \
|
||||
-s llm_schema.json \
|
||||
-o json
|
||||
```
|
||||
|
||||
### Content Filtering
|
||||
|
||||
Filter content for relevance:
|
||||
|
||||
```yaml
|
||||
# filter_bm25.yml
|
||||
type: "bm25"
|
||||
query: "target content"
|
||||
threshold: 1.0
|
||||
|
||||
# filter_pruning.yml
|
||||
type: "pruning"
|
||||
query: "focus topic"
|
||||
threshold: 0.48
|
||||
```
|
||||
|
||||
```bash
|
||||
crwl https://example.com -f filter_bm25.yml -o markdown-fit
|
||||
```
|
||||
|
||||
## Output Formats

- `all` - Full crawl result including metadata
- `json` - Extracted structured data (when using extraction)
- `markdown` / `md` - Raw markdown output
- `markdown-fit` / `md-fit` - Filtered markdown for better readability
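
For example, the same crawl can be emitted in each format simply by switching the `-o` flag (an extraction config is only needed for `json`, and a filter config for `md-fit`):

```bash
crwl https://example.com -o all                                           # full result with metadata
crwl https://example.com -o md                                            # raw markdown
crwl https://example.com -f filter_bm25.yml -o md-fit                     # filtered markdown
crwl https://example.com -e extract_css.yml -s css_schema.json -o json    # structured data
```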
|
||||
|
||||
## Complete Examples
|
||||
|
||||
1. Basic Extraction:
|
||||
```bash
|
||||
crwl https://example.com \
|
||||
-B browser.yml \
|
||||
-C crawler.yml \
|
||||
-o json
|
||||
```
|
||||
|
||||
2. Structured Data Extraction:
|
||||
```bash
|
||||
crwl https://example.com \
|
||||
-e extract_css.yml \
|
||||
-s css_schema.json \
|
||||
-o json \
|
||||
-v
|
||||
```
|
||||
|
||||
3. LLM Extraction with Filtering:
|
||||
```bash
|
||||
crwl https://example.com \
|
||||
-B browser.yml \
|
||||
-e extract_llm.yml \
|
||||
-s llm_schema.json \
|
||||
-f filter_bm25.yml \
|
||||
-o json
|
||||
```
|
||||
|
||||
4. Interactive Q&A:
|
||||
```bash
|
||||
# First crawl and view
|
||||
crwl https://example.com -o markdown
|
||||
|
||||
# Then ask questions
|
||||
crwl https://example.com -q "What are the main points?"
|
||||
crwl https://example.com -q "Summarize the conclusions"
|
||||
```
|
||||
|
||||
## Best Practices & Tips
|
||||
|
||||
1. **Configuration Management**:
|
||||
- Keep common configurations in YAML files
|
||||
- Use CLI parameters for quick overrides
|
||||
- Store sensitive data (API tokens) in `~/.crawl4ai/global.yml`
|
||||
|
||||
2. **Performance Optimization**:
|
||||
- Use `--bypass-cache` for fresh content
|
||||
- Enable `scan_full_page` for infinite scroll pages
|
||||
- Adjust `delay_before_return_html` for dynamic content
|
||||
|
||||
3. **Content Extraction**:
|
||||
- Use CSS extraction for structured content
|
||||
- Use LLM extraction for unstructured content
|
||||
- Combine with filters for focused results
|
||||
|
||||
4. **Q&A Workflow**:
|
||||
- View content first with `-o markdown`
|
||||
- Ask specific questions
|
||||
- Use broader context with appropriate selectors
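
Putting several of these tips together, a typical invocation (reusing the sample config files from `docs/examples/cli/`) might look like:

```bash
crwl https://example.com \
    -B docs/examples/cli/browser.yml \
    -c "scan_full_page=true,delay_before_return_html=2" \
    --bypass-cache \
    -o markdown
```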
|
||||
|
||||
## Recap
|
||||
|
||||
The Crawl4AI CLI provides:
|
||||
- Flexible configuration via files and parameters
|
||||
- Multiple extraction strategies (CSS, XPath, LLM)
|
||||
- Content filtering and optimization
|
||||
- Interactive Q&A capabilities
|
||||
- Various output formats
|
||||
|
||||
@@ -15,6 +15,7 @@ nav:
     - "Blog Home": "blog/index.md"
     - "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
   - Core:
+    - "Command Line Interface": "core/cli.md"
     - "Simple Crawling": "core/simple-crawling.md"
     - "Crawler Result": "core/crawler-result.md"
     - "Browser & Crawler Config": "core/browser-crawler-config.md"
tests/cli/test_cli.py (new file, 133 lines)
@@ -0,0 +1,133 @@
|
||||
import pytest
|
||||
from click.testing import CliRunner
|
||||
from pathlib import Path
|
||||
import json
|
||||
import yaml
|
||||
from crawl4ai.cli import cli, load_config_file, parse_key_values
|
||||
import tempfile
|
||||
import os
|
||||
import click
|
||||
|
||||
@pytest.fixture
|
||||
def runner():
|
||||
return CliRunner()
|
||||
|
||||
@pytest.fixture
|
||||
def temp_config_dir():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
old_home = os.environ.get('HOME')
|
||||
os.environ['HOME'] = tmpdir
|
||||
yield Path(tmpdir)
|
||||
if old_home:
|
||||
os.environ['HOME'] = old_home
|
||||
|
||||
@pytest.fixture
|
||||
def sample_configs(temp_config_dir):
|
||||
configs = {
|
||||
'browser.yml': {
|
||||
'headless': True,
|
||||
'viewport_width': 1280,
|
||||
'user_agent_mode': 'random'
|
||||
},
|
||||
'crawler.yml': {
|
||||
'cache_mode': 'bypass',
|
||||
'wait_until': 'networkidle',
|
||||
'scan_full_page': True
|
||||
},
|
||||
'extract_css.yml': {
|
||||
'type': 'json-css',
|
||||
'params': {'verbose': True}
|
||||
},
|
||||
'css_schema.json': {
|
||||
'name': 'ArticleExtractor',
|
||||
'baseSelector': '.article',
|
||||
'fields': [
|
||||
{'name': 'title', 'selector': 'h1.title', 'type': 'text'},
|
||||
{'name': 'link', 'selector': 'a.read-more', 'type': 'attribute', 'attribute': 'href'}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
for filename, content in configs.items():
|
||||
path = temp_config_dir / filename
|
||||
with open(path, 'w') as f:
|
||||
if filename.endswith('.yml'):
|
||||
yaml.dump(content, f)
|
||||
else:
|
||||
json.dump(content, f)
|
||||
|
||||
return {name: str(temp_config_dir / name) for name in configs}
|
||||
|
||||
class TestCLIBasics:
|
||||
def test_help(self, runner):
|
||||
result = runner.invoke(cli, ['--help'])
|
||||
assert result.exit_code == 0
|
||||
assert 'Crawl4AI CLI' in result.output
|
||||
|
||||
def test_examples(self, runner):
|
||||
result = runner.invoke(cli, ['--example'])
|
||||
assert result.exit_code == 0
|
||||
assert 'Examples' in result.output
|
||||
|
||||
def test_missing_url(self, runner):
|
||||
result = runner.invoke(cli)
|
||||
assert result.exit_code != 0
|
||||
assert 'URL argument is required' in result.output
|
||||
|
||||
class TestConfigParsing:
|
||||
def test_parse_key_values_basic(self):
|
||||
result = parse_key_values(None, None, "key1=value1,key2=true")
|
||||
assert result == {'key1': 'value1', 'key2': True}
|
||||
|
||||
def test_parse_key_values_invalid(self):
|
||||
with pytest.raises(click.BadParameter):
|
||||
parse_key_values(None, None, "invalid_format")
|
||||
|
||||
class TestConfigLoading:
|
||||
def test_load_yaml_config(self, sample_configs):
|
||||
config = load_config_file(sample_configs['browser.yml'])
|
||||
assert config['headless'] is True
|
||||
assert config['viewport_width'] == 1280
|
||||
|
||||
def test_load_json_config(self, sample_configs):
|
||||
config = load_config_file(sample_configs['css_schema.json'])
|
||||
assert config['name'] == 'ArticleExtractor'
|
||||
assert len(config['fields']) == 2
|
||||
|
||||
def test_load_nonexistent_config(self):
|
||||
with pytest.raises(click.BadParameter):
|
||||
load_config_file('nonexistent.yml')
|
||||
|
||||
class TestLLMConfig:
|
||||
def test_llm_config_creation(self, temp_config_dir, runner):
|
||||
def input_simulation(inputs):
|
||||
return runner.invoke(cli, ['https://example.com', '-q', 'test question'],
|
||||
input='\n'.join(inputs))
|
||||
|
||||
class TestCrawlingFeatures:
|
||||
def test_basic_crawl(self, runner):
|
||||
result = runner.invoke(cli, ['https://example.com'])
|
||||
assert result.exit_code == 0
|
||||
|
||||
|
||||
class TestErrorHandling:
|
||||
def test_invalid_config_file(self, runner):
|
||||
result = runner.invoke(cli, [
|
||||
'https://example.com',
|
||||
'--browser-config', 'nonexistent.yml'
|
||||
])
|
||||
assert result.exit_code != 0
|
||||
|
||||
def test_invalid_schema(self, runner, temp_config_dir):
|
||||
invalid_schema = temp_config_dir / 'invalid_schema.json'
|
||||
with open(invalid_schema, 'w') as f:
|
||||
f.write('invalid json')
|
||||
|
||||
result = runner.invoke(cli, [
|
||||
'https://example.com',
|
||||
'--schema', str(invalid_schema)
|
||||
])
|
||||
assert result.exit_code != 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main(['-v', '-s', '--tb=native', __file__])
|
||||