feat(profiles): add CLI command for crawling with browser profiles

Adds new functionality to crawl websites using saved browser profiles directly from the CLI.
This includes:
- New CLI option to use profiles for crawling
- Helper functions for profile-based crawling
- Fixed type hints for config parameters
- Updated example to show browser window by default

This makes it easier for users to leverage saved browser profiles for crawling without writing code.
This commit is contained in:
UncleCode 2025-03-02 21:33:33 +08:00
parent 95175cb394
commit c612f9a852
4 changed files with 87 additions and 7 deletions

View File

@ -222,7 +222,7 @@ class AsyncWebCrawler:
async def arun(
self,
url: str,
config: Optional[CrawlerRunConfig] = None,
config: CrawlerRunConfig = None,
# Legacy parameters maintained for backwards compatibility
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
@ -270,7 +270,7 @@ class AsyncWebCrawler:
Returns:
CrawlResult: The result of crawling and processing
"""
crawler_config = config
crawler_config = config or CrawlerRunConfig()
if not isinstance(url, str) or not url:
raise ValueError("Invalid URL, make sure the URL is a non-empty string")

View File

@ -431,6 +431,81 @@ def delete_profile_interactive(profiler: BrowserProfiler):
except (ValueError, IndexError):
console.print("[red]Invalid selection.[/red]")
async def crawl_with_profile_cli(profile_path, url):
    """Crawl a URL from the CLI using a saved browser profile.

    Asks the user for an output format, runs the crawl through
    ``run_crawler`` with a non-headless managed browser whose user data
    directory is ``profile_path``, then prints the requested view of the
    result.

    Args:
        profile_path: User data directory of the saved browser profile.
        url: Address to crawl.

    Returns:
        The crawl result on success, or ``None`` when the crawl reported a
        failure or an exception was raised while crawling or rendering.
    """
    console.print(f"[cyan]Crawling [bold]{url}[/bold] using profile at [bold]{profile_path}[/bold][/cyan]")

    # Create browser config with the profile
    browser_cfg = BrowserConfig(
        headless=False,  # Set to False to see the browser in action
        use_managed_browser=True,
        user_data_dir=profile_path,
    )

    # Default crawler config
    crawler_cfg = CrawlerRunConfig()

    # Ask for output format
    output_format = Prompt.ask(
        "[cyan]Output format[/cyan]",
        choices=["all", "json", "markdown", "md", "title"],
        default="markdown",
    )

    try:
        # Run the crawler
        result = await run_crawler(url, browser_cfg, crawler_cfg, True)

        # NOTE(review): presumably the result exposes `success` and
        # `error_message`; the getattr defaults make this check a no-op if it
        # does not -- confirm against the CrawlResult model.
        if not getattr(result, "success", True):
            reason = getattr(result, "error_message", None) or "unknown error"
            console.print("[red]Error crawling:[/red]", end=" ")
            console.print(str(reason), markup=False, emoji=False)
            return None

        # Crawled content is printed with markup/emoji processing disabled so
        # that bracketed text (e.g. markdown links) and ":name:" sequences are
        # emitted verbatim instead of being interpreted by Rich.
        if output_format == "all":
            console.print(
                json.dumps(result.model_dump(), indent=2),
                markup=False,
                emoji=False,
            )
        elif output_format == "json":
            extracted = result.extracted_content
            if extracted:
                console.print(
                    json.dumps(json.loads(extracted), indent=2),
                    markup=False,
                    emoji=False,
                )
            else:
                # json.loads(None) would raise TypeError and be misreported
                # as a crawl error; say what actually happened instead.
                console.print("[yellow]No extracted content available for JSON output.[/yellow]")
        elif output_format in ("markdown", "md"):
            console.print(result.markdown.raw_markdown, markup=False, emoji=False)
        elif output_format == "title":
            title = (result.metadata or {}).get("title") or "No title found"
            console.print(title, markup=False, emoji=False)

        console.print(f"[green]Successfully crawled[/green] {url}")
        return result
    except Exception as e:
        # Top-level CLI boundary: report the problem and keep the menu alive.
        console.print(f"[red]Error crawling:[/red] {str(e)}")
        return None
async def use_profile_to_crawl():
    """Interactively select a saved browser profile and crawl a URL with it.

    Lists the profiles returned by ``BrowserProfiler.list_profiles()``, asks
    the user to pick one by its 1-based number, asks for a URL, and hands both
    to ``crawl_with_profile_cli``. Prints a message and returns early when no
    profiles exist, the selection is invalid, or no URL is entered.
    """
    profiler = BrowserProfiler()
    profiles = profiler.list_profiles()

    if not profiles:
        console.print("[yellow]No profiles found. Create one first.[/yellow]")
        return

    # Display profiles
    display_profiles_table(profiles)

    # Get profile selection (1-based numbers, validated by the prompt)
    selection = Prompt.ask(
        "[cyan]Enter number of profile to use[/cyan]",
        console=console,
        choices=[str(number) for number in range(1, len(profiles) + 1)],
        show_choices=False,
    )

    # Keep the try body limited to the conversion/lookup so that errors raised
    # later (while prompting or crawling) are not mislabelled as a bad choice.
    try:
        profile = profiles[int(selection) - 1]
    except (ValueError, IndexError):
        console.print("[red]Invalid selection[/red]")
        return

    # Get URL
    url = Prompt.ask("[cyan]Enter URL to crawl[/cyan]", console=console)
    if not url:
        console.print("[red]No URL provided[/red]")
        return

    # Crawl with the selected profile
    await crawl_with_profile_cli(profile["path"], url)
async def manage_profiles():
"""Interactive profile management menu"""
profiler = BrowserProfiler()
@ -439,14 +514,15 @@ async def manage_profiles():
"1": "List profiles",
"2": "Create new profile",
"3": "Delete profile",
"4": "Exit",
"4": "Use a profile to crawl a website",
"5": "Exit",
}
while True:
console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan"))
for key, value in options.items():
color = "green" if key == "1" else "yellow" if key == "2" else "red" if key == "3" else "cyan"
color = "green" if key == "1" else "yellow" if key == "2" else "red" if key == "3" else "blue" if key == "4" else "cyan"
console.print(f"[{color}]{key}[/{color}]. {value}")
choice = Prompt.ask("Enter choice", choices=list(options.keys()), default="1")
@ -465,6 +541,10 @@ async def manage_profiles():
delete_profile_interactive(profiler)
elif choice == "4":
# Use profile to crawl
await use_profile_to_crawl()
elif choice == "5":
# Exit
console.print("[cyan]Exiting profile manager.[/cyan]")
break

View File

@ -16,7 +16,7 @@ class DeepCrawlDecorator:
def __call__(self, original_arun):
@wraps(original_arun)
async def wrapped_arun(url: str, config: Optional[CrawlerRunConfig] = None, **kwargs):
async def wrapped_arun(url: str, config: CrawlerRunConfig = None, **kwargs):
# If deep crawling is already active, call the original method to avoid recursion.
if config and config.deep_crawl_strategy and not self.deep_crawl_active.get():
token = self.deep_crawl_active.set(True)

View File

@ -32,7 +32,7 @@ async def crawl_with_profile(profile_path, url):
# Create browser config with the profile path
browser_config = BrowserConfig(
headless=True, # Set to False if you want to see the browser window
headless=False, # Set to False if you want to see the browser window
use_managed_browser=True, # Required for persistent profiles
user_data_dir=profile_path
)