feat(browser): add builtin browser management system

Implements a persistent browser management system that allows running a single shared browser instance
that can be reused across multiple crawler sessions. Key changes include:

- Added browser_mode config option with 'builtin', 'dedicated', and 'custom' modes
- Implemented builtin browser management in BrowserProfiler
- Added CLI commands for managing builtin browser (start, stop, status, restart, view)
- Modified browser process handling to support detached processes
- Added automatic builtin browser setup during package installation

BREAKING CHANGE: The browser_mode config option changes how browser instances are managed
This commit is contained in:
UncleCode 2025-03-20 12:13:59 +08:00
parent 5358ac0fc2
commit 6432ff1257
9 changed files with 1205 additions and 21 deletions

View File

@ -169,6 +169,11 @@ class BrowserConfig:
Default: "chromium".
headless (bool): Whether to run the browser in headless mode (no visible GUI).
Default: True.
browser_mode (str): Determines how the browser should be initialized:
"builtin" - use the builtin CDP browser running in background
"dedicated" - create a new dedicated browser instance each time
"custom" - use explicit CDP settings provided in cdp_url
Default: "dedicated"
use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
advanced manipulation. Default: False.
cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: "ws://localhost:9222/devtools/browser/".
@ -221,6 +226,7 @@ class BrowserConfig:
self,
browser_type: str = "chromium",
headless: bool = True,
browser_mode: str = "dedicated",
use_managed_browser: bool = False,
cdp_url: str = None,
use_persistent_context: bool = False,
@ -257,6 +263,7 @@ class BrowserConfig:
):
self.browser_type = browser_type
self.headless = headless
self.browser_mode = browser_mode
self.use_managed_browser = use_managed_browser
self.cdp_url = cdp_url
self.use_persistent_context = use_persistent_context
@ -290,6 +297,7 @@ class BrowserConfig:
self.sleep_on_close = sleep_on_close
self.verbose = verbose
self.debugging_port = debugging_port
self.host = host
fa_user_agenr_generator = ValidUAGenerator()
if self.user_agent_mode == "random":
@ -302,6 +310,18 @@ class BrowserConfig:
self.browser_hint = UAGen.generate_client_hints(self.user_agent)
self.headers.setdefault("sec-ch-ua", self.browser_hint)
# Set appropriate browser management flags based on browser_mode
if self.browser_mode == "builtin":
# Builtin mode uses managed browser connecting to builtin CDP endpoint
self.use_managed_browser = True
# cdp_url will be set later by browser_manager
elif self.browser_mode == "custom" and self.cdp_url:
# Custom mode with explicit CDP URL
self.use_managed_browser = True
elif self.browser_mode == "dedicated":
# Dedicated mode uses a new browser instance each time
pass
# If persistent context is requested, ensure managed browser is enabled
if self.use_persistent_context:
self.use_managed_browser = True
@ -311,6 +331,7 @@ class BrowserConfig:
return BrowserConfig(
browser_type=kwargs.get("browser_type", "chromium"),
headless=kwargs.get("headless", True),
browser_mode=kwargs.get("browser_mode", "dedicated"),
use_managed_browser=kwargs.get("use_managed_browser", False),
cdp_url=kwargs.get("cdp_url"),
use_persistent_context=kwargs.get("use_persistent_context", False),
@ -338,12 +359,15 @@ class BrowserConfig:
text_mode=kwargs.get("text_mode", False),
light_mode=kwargs.get("light_mode", False),
extra_args=kwargs.get("extra_args", []),
debugging_port=kwargs.get("debugging_port", 9222),
host=kwargs.get("host", "localhost"),
)
def to_dict(self):
return {
"browser_type": self.browser_type,
"headless": self.headless,
"browser_mode": self.browser_mode,
"use_managed_browser": self.use_managed_browser,
"cdp_url": self.cdp_url,
"use_persistent_context": self.use_persistent_context,
@ -370,6 +394,7 @@ class BrowserConfig:
"sleep_on_close": self.sleep_on_close,
"verbose": self.verbose,
"debugging_port": self.debugging_port,
"host": self.host,
}
def clone(self, **kwargs):

View File

@ -201,13 +201,35 @@ class AsyncWebCrawler:
This is equivalent to using 'async with' but gives more control over the lifecycle.
This method will:
1. Initialize the browser and context
2. Perform warmup sequence
3. Return the crawler instance for method chaining
1. Check for builtin browser if browser_mode is 'builtin'
2. Initialize the browser and context
3. Perform warmup sequence
4. Return the crawler instance for method chaining
Returns:
AsyncWebCrawler: The initialized crawler instance
"""
# Check for builtin browser if requested
if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url:
# Import here to avoid circular imports
from .browser_profiler import BrowserProfiler
profiler = BrowserProfiler(logger=self.logger)
# Get builtin browser info or launch if needed
browser_info = profiler.get_builtin_browser_info()
if not browser_info:
self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER")
cdp_url = await profiler.launch_builtin_browser()
if not cdp_url:
self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER")
else:
self.browser_config.cdp_url = cdp_url
self.browser_config.use_managed_browser = True
else:
self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER")
self.browser_config.cdp_url = browser_info.get('cdp_url')
self.browser_config.use_managed_browser = True
await self.crawler_strategy.__aenter__()
await self.awarmup()
return self
@ -280,6 +302,10 @@ class AsyncWebCrawler:
Returns:
CrawlResult: The result of crawling and processing
"""
# Auto-start if not ready
if not self.ready:
await self.start()
config = config or CrawlerRunConfig()
if not isinstance(url, str) or not url:
raise ValueError("Invalid URL, make sure the URL is a non-empty string")

View File

@ -145,17 +145,59 @@ class ManagedBrowser:
# Start browser process
try:
self.browser_process = subprocess.Popen(
args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
# Monitor browser process output for errors
asyncio.create_task(self._monitor_browser_process())
# Use DETACHED_PROCESS flag on Windows to fully detach the process
# On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group
if sys.platform == "win32":
self.browser_process = subprocess.Popen(
args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
)
else:
self.browser_process = subprocess.Popen(
args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
preexec_fn=os.setpgrp # Start in a new process group
)
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
await self._initial_startup_check()
await asyncio.sleep(2) # Give browser time to start
return f"http://{self.host}:{self.debugging_port}"
except Exception as e:
await self.cleanup()
raise Exception(f"Failed to start browser: {e}")
async def _initial_startup_check(self):
"""
Perform a quick check to make sure the browser started successfully.
This only runs once at startup rather than continuously monitoring.
"""
if not self.browser_process:
return
# Check that process started without immediate termination
await asyncio.sleep(0.5)
if self.browser_process.poll() is not None:
# Process already terminated
stdout, stderr = b"", b""
try:
stdout, stderr = self.browser_process.communicate(timeout=0.5)
except subprocess.TimeoutExpired:
pass
self.logger.error(
message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
tag="ERROR",
params={
"code": self.browser_process.returncode,
"stdout": stdout.decode() if stdout else "",
"stderr": stderr.decode() if stderr else "",
},
)
async def _monitor_browser_process(self):
"""
Monitor the browser process for unexpected termination.
@ -167,6 +209,7 @@ class ManagedBrowser:
4. If any other error occurs, log the error message.
Note: This method should be called in a separate task to avoid blocking the main event loop.
This is DEPRECATED and should not be used for builtin browsers that need to outlive the Python process.
"""
if self.browser_process:
try:
@ -261,22 +304,33 @@ class ManagedBrowser:
if self.browser_process:
try:
self.browser_process.terminate()
# Wait for process to end gracefully
for _ in range(10): # 10 attempts, 100ms each
if self.browser_process.poll() is not None:
break
await asyncio.sleep(0.1)
# For builtin browsers that should persist, we should check if it's a detached process
# Only terminate if we have proper control over the process
if not self.browser_process.poll():
# Process is still running
self.browser_process.terminate()
# Wait for process to end gracefully
for _ in range(10): # 10 attempts, 100ms each
if self.browser_process.poll() is not None:
break
await asyncio.sleep(0.1)
# Force kill if still running
if self.browser_process.poll() is None:
self.browser_process.kill()
await asyncio.sleep(0.1) # Brief wait for kill to take effect
# Force kill if still running
if self.browser_process.poll() is None:
if sys.platform == "win32":
# On Windows we might need taskkill for detached processes
try:
subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)])
except Exception:
self.browser_process.kill()
else:
self.browser_process.kill()
await asyncio.sleep(0.1) # Brief wait for kill to take effect
except Exception as e:
self.logger.error(
message="Error terminating browser: {error}",
tag="ERROR",
tag="ERROR",
params={"error": str(e)},
)
@ -379,7 +433,15 @@ class BrowserManager:
sessions (dict): Dictionary to store session information
session_ttl (int): Session timeout in seconds
"""
_playwright_instance = None
@classmethod
async def get_playwright(cls):
from playwright.async_api import async_playwright
if cls._playwright_instance is None:
cls._playwright_instance = await async_playwright().start()
return cls._playwright_instance
def __init__(self, browser_config: BrowserConfig, logger=None):
"""
@ -429,6 +491,7 @@ class BrowserManager:
Note: This method should be called in a separate task to avoid blocking the main event loop.
"""
self.playwright = await self.get_playwright()
if self.playwright is None:
from playwright.async_api import async_playwright

View File

@ -12,7 +12,10 @@ import sys
import datetime
import uuid
import shutil
from typing import List, Dict, Optional, Any
import json
import subprocess
import time
from typing import List, Dict, Optional, Any, Tuple
from colorama import Fore, Style, init
from .async_configs import BrowserConfig
@ -56,6 +59,11 @@ class BrowserProfiler:
# Ensure profiles directory exists
self.profiles_dir = os.path.join(get_home_folder(), "profiles")
os.makedirs(self.profiles_dir, exist_ok=True)
# Builtin browser config file
self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser")
self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json")
os.makedirs(self.builtin_browser_dir, exist_ok=True)
async def create_profile(self,
profile_name: Optional[str] = None,
@ -552,7 +560,8 @@ class BrowserProfiler:
browser_type: str = "chromium",
user_data_dir: Optional[str] = None,
debugging_port: int = 9222,
headless: bool = False) -> Optional[str]:
headless: bool = False,
save_as_builtin: bool = False) -> Optional[str]:
"""
Launch a standalone browser with CDP debugging enabled and keep it running
until the user presses 'q'. Returns and displays the CDP URL.
@ -766,4 +775,201 @@ class BrowserProfiler:
# Return the CDP URL
return cdp_url
async def launch_builtin_browser(self,
browser_type: str = "chromium",
debugging_port: int = 9222,
headless: bool = True) -> Optional[str]:
"""
Launch a browser in the background for use as the builtin browser.
Args:
browser_type (str): Type of browser to launch ('chromium' or 'firefox')
debugging_port (int): Port to use for CDP debugging
headless (bool): Whether to run in headless mode
Returns:
str: CDP URL for the browser, or None if launch failed
"""
# Check if there's an existing browser still running
browser_info = self.get_builtin_browser_info()
if browser_info and self._is_browser_running(browser_info.get('pid')):
self.logger.info("Builtin browser is already running", tag="BUILTIN")
return browser_info.get('cdp_url')
# Create a user data directory for the builtin browser
user_data_dir = os.path.join(self.builtin_browser_dir, "user_data")
os.makedirs(user_data_dir, exist_ok=True)
# Create managed browser instance
managed_browser = ManagedBrowser(
browser_type=browser_type,
user_data_dir=user_data_dir,
headless=headless,
logger=self.logger,
debugging_port=debugging_port
)
try:
# Start the browser
await managed_browser.start()
# Check if browser started successfully
browser_process = managed_browser.browser_process
if not browser_process:
self.logger.error("Failed to start browser process.", tag="BUILTIN")
return None
# Get CDP URL
cdp_url = f"http://localhost:{debugging_port}"
# Try to verify browser is responsive by fetching version info
import aiohttp
json_url = f"{cdp_url}/json/version"
config_json = None
try:
async with aiohttp.ClientSession() as session:
for _ in range(10): # Try multiple times
try:
async with session.get(json_url) as response:
if response.status == 200:
config_json = await response.json()
break
except Exception:
pass
await asyncio.sleep(0.5)
except Exception as e:
self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN")
# Save browser info
browser_info = {
'pid': browser_process.pid,
'cdp_url': cdp_url,
'user_data_dir': user_data_dir,
'browser_type': browser_type,
'debugging_port': debugging_port,
'start_time': time.time(),
'config': config_json
}
with open(self.builtin_config_file, 'w') as f:
json.dump(browser_info, f, indent=2)
# Detach from the browser process - don't keep any references
# This is important to allow the Python script to exit while the browser continues running
# We'll just record the PID and other info, and the browser will run independently
managed_browser.browser_process = None
self.logger.success(f"Builtin browser launched at CDP URL: {cdp_url}", tag="BUILTIN")
return cdp_url
except Exception as e:
self.logger.error(f"Error launching builtin browser: {str(e)}", tag="BUILTIN")
if managed_browser:
await managed_browser.cleanup()
return None
def get_builtin_browser_info(self) -> Optional[Dict[str, Any]]:
"""
Get information about the builtin browser.
Returns:
dict: Browser information or None if no builtin browser is configured
"""
if not os.path.exists(self.builtin_config_file):
return None
try:
with open(self.builtin_config_file, 'r') as f:
browser_info = json.load(f)
# Check if the browser is still running
if not self._is_browser_running(browser_info.get('pid')):
self.logger.warning("Builtin browser is not running", tag="BUILTIN")
return None
return browser_info
except Exception as e:
self.logger.error(f"Error reading builtin browser config: {str(e)}", tag="BUILTIN")
return None
def _is_browser_running(self, pid: Optional[int]) -> bool:
"""Check if a process with the given PID is running"""
if not pid:
return False
try:
# Check if the process exists
if sys.platform == "win32":
process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"],
capture_output=True, text=True)
return str(pid) in process.stdout
else:
# Unix-like systems
os.kill(pid, 0) # This doesn't actually kill the process, just checks if it exists
return True
except (ProcessLookupError, PermissionError, OSError):
return False
async def kill_builtin_browser(self) -> bool:
"""
Kill the builtin browser if it's running.
Returns:
bool: True if the browser was killed, False otherwise
"""
browser_info = self.get_builtin_browser_info()
if not browser_info:
self.logger.warning("No builtin browser found", tag="BUILTIN")
return False
pid = browser_info.get('pid')
if not pid:
return False
try:
if sys.platform == "win32":
subprocess.run(["taskkill", "/F", "/PID", str(pid)], check=True)
else:
os.kill(pid, signal.SIGTERM)
# Wait for termination
for _ in range(5):
if not self._is_browser_running(pid):
break
await asyncio.sleep(0.5)
else:
# Force kill if still running
os.kill(pid, signal.SIGKILL)
# Remove config file
if os.path.exists(self.builtin_config_file):
os.unlink(self.builtin_config_file)
self.logger.success("Builtin browser terminated", tag="BUILTIN")
return True
except Exception as e:
self.logger.error(f"Error killing builtin browser: {str(e)}", tag="BUILTIN")
return False
async def get_builtin_browser_status(self) -> Dict[str, Any]:
"""
Get status information about the builtin browser.
Returns:
dict: Status information with running, cdp_url, and info fields
"""
browser_info = self.get_builtin_browser_info()
if not browser_info:
return {
'running': False,
'cdp_url': None,
'info': None
}
return {
'running': True,
'cdp_url': browser_info.get('cdp_url'),
'info': browser_info
}

View File

@ -341,6 +341,32 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
crwl profiles # Select "Create new profile" option
# 2. Then use that profile to crawl authenticated content:
crwl https://site-requiring-login.com/dashboard -p my-profile-name
🔄 Builtin Browser Management:
# Start a builtin browser (runs in the background)
crwl browser start
# Check builtin browser status
crwl browser status
# Open a visible window to see the browser
crwl browser view --url https://example.com
# Stop the builtin browser
crwl browser stop
# Restart with different options
crwl browser restart --browser-type chromium --port 9223 --no-headless
# Use the builtin browser in your code
# (Just set browser_mode="builtin" in your BrowserConfig)
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True
)
# Usage via CLI:
crwl https://example.com -b "browser_mode=builtin"
"""
click.echo(examples)
@ -575,6 +601,307 @@ def cli():
pass
@cli.group("browser")
def browser_cmd():
"""Manage browser instances for Crawl4AI
Commands to manage browser instances for Crawl4AI, including:
- status - Check status of the builtin browser
- start - Start a new builtin browser
- stop - Stop the running builtin browser
- restart - Restart the builtin browser
"""
pass
@browser_cmd.command("status")
def browser_status_cmd():
"""Show status of the builtin browser"""
profiler = BrowserProfiler()
try:
status = anyio.run(profiler.get_builtin_browser_status)
if status["running"]:
info = status["info"]
console.print(Panel(
f"[green]Builtin browser is running[/green]\n\n"
f"CDP URL: [cyan]{info['cdp_url']}[/cyan]\n"
f"Process ID: [yellow]{info['pid']}[/yellow]\n"
f"Browser type: [blue]{info['browser_type']}[/blue]\n"
f"User data directory: [magenta]{info['user_data_dir']}[/magenta]\n"
f"Started: [cyan]{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(info['start_time']))}[/cyan]",
title="Builtin Browser Status",
border_style="green"
))
else:
console.print(Panel(
"[yellow]Builtin browser is not running[/yellow]\n\n"
"Use 'crwl browser start' to start a builtin browser",
title="Builtin Browser Status",
border_style="yellow"
))
except Exception as e:
console.print(f"[red]Error checking browser status: {str(e)}[/red]")
sys.exit(1)
@browser_cmd.command("start")
@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium",
help="Browser type (default: chromium)")
@click.option("--port", "-p", type=int, default=9222, help="Debugging port (default: 9222)")
@click.option("--headless/--no-headless", default=True, help="Run browser in headless mode")
def browser_start_cmd(browser_type: str, port: int, headless: bool):
"""Start a builtin browser instance
This will start a persistent browser instance that can be used by Crawl4AI
by setting browser_mode="builtin" in BrowserConfig.
"""
profiler = BrowserProfiler()
# First check if browser is already running
status = anyio.run(profiler.get_builtin_browser_status)
if status["running"]:
console.print(Panel(
"[yellow]Builtin browser is already running[/yellow]\n\n"
f"CDP URL: [cyan]{status['cdp_url']}[/cyan]\n\n"
"Use 'crwl browser restart' to restart the browser",
title="Builtin Browser Start",
border_style="yellow"
))
return
try:
console.print(Panel(
f"[cyan]Starting builtin browser[/cyan]\n\n"
f"Browser type: [green]{browser_type}[/green]\n"
f"Debugging port: [yellow]{port}[/yellow]\n"
f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]",
title="Builtin Browser Start",
border_style="cyan"
))
cdp_url = anyio.run(
profiler.launch_builtin_browser,
browser_type,
port,
headless
)
if cdp_url:
console.print(Panel(
f"[green]Builtin browser started successfully[/green]\n\n"
f"CDP URL: [cyan]{cdp_url}[/cyan]\n\n"
"This browser will be used automatically when setting browser_mode='builtin'",
title="Builtin Browser Start",
border_style="green"
))
else:
console.print(Panel(
"[red]Failed to start builtin browser[/red]",
title="Builtin Browser Start",
border_style="red"
))
sys.exit(1)
except Exception as e:
console.print(f"[red]Error starting builtin browser: {str(e)}[/red]")
sys.exit(1)
@browser_cmd.command("stop")
def browser_stop_cmd():
"""Stop the running builtin browser"""
profiler = BrowserProfiler()
try:
# First check if browser is running
status = anyio.run(profiler.get_builtin_browser_status)
if not status["running"]:
console.print(Panel(
"[yellow]No builtin browser is currently running[/yellow]",
title="Builtin Browser Stop",
border_style="yellow"
))
return
console.print(Panel(
"[cyan]Stopping builtin browser...[/cyan]",
title="Builtin Browser Stop",
border_style="cyan"
))
success = anyio.run(profiler.kill_builtin_browser)
if success:
console.print(Panel(
"[green]Builtin browser stopped successfully[/green]",
title="Builtin Browser Stop",
border_style="green"
))
else:
console.print(Panel(
"[red]Failed to stop builtin browser[/red]",
title="Builtin Browser Stop",
border_style="red"
))
sys.exit(1)
except Exception as e:
console.print(f"[red]Error stopping builtin browser: {str(e)}[/red]")
sys.exit(1)
@browser_cmd.command("view")
@click.option("--url", "-u", help="URL to navigate to (defaults to about:blank)")
def browser_view_cmd(url: Optional[str]):
"""
Open a visible window of the builtin browser
This command connects to the running builtin browser and opens a visible window,
allowing you to see what the browser is currently viewing or navigate to a URL.
"""
profiler = BrowserProfiler()
try:
# First check if browser is running
status = anyio.run(profiler.get_builtin_browser_status)
if not status["running"]:
console.print(Panel(
"[yellow]No builtin browser is currently running[/yellow]\n\n"
"Use 'crwl browser start' to start a builtin browser first",
title="Builtin Browser View",
border_style="yellow"
))
return
info = status["info"]
cdp_url = info["cdp_url"]
console.print(Panel(
f"[cyan]Opening visible window connected to builtin browser[/cyan]\n\n"
f"CDP URL: [green]{cdp_url}[/green]\n"
f"URL to load: [yellow]{url or 'about:blank'}[/yellow]",
title="Builtin Browser View",
border_style="cyan"
))
# Use the CDP URL to launch a new visible window
import subprocess
import os
# Determine the browser command based on platform
if sys.platform == "darwin": # macOS
browser_cmd = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"]
elif sys.platform == "win32": # Windows
browser_cmd = ["C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"]
else: # Linux
browser_cmd = ["google-chrome"]
# Add arguments
browser_args = [
f"--remote-debugging-port={info['debugging_port']}",
"--remote-debugging-address=localhost",
"--no-first-run",
"--no-default-browser-check"
]
# Add URL if provided
if url:
browser_args.append(url)
# Launch browser
try:
subprocess.Popen(browser_cmd + browser_args)
console.print("[green]Browser window opened. Close it when finished viewing.[/green]")
except Exception as e:
console.print(f"[red]Error launching browser: {str(e)}[/red]")
console.print(f"[yellow]Try connecting manually to {cdp_url} in Chrome or using the '--remote-debugging-port' flag.[/yellow]")
except Exception as e:
console.print(f"[red]Error viewing builtin browser: {str(e)}[/red]")
sys.exit(1)
@browser_cmd.command("restart")
@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default=None,
help="Browser type (defaults to same as current)")
@click.option("--port", "-p", type=int, default=None, help="Debugging port (defaults to same as current)")
@click.option("--headless/--no-headless", default=None, help="Run browser in headless mode")
def browser_restart_cmd(browser_type: Optional[str], port: Optional[int], headless: Optional[bool]):
"""Restart the builtin browser
Stops the current builtin browser if running and starts a new one.
By default, uses the same configuration as the current browser.
"""
profiler = BrowserProfiler()
try:
# First check if browser is running and get its config
status = anyio.run(profiler.get_builtin_browser_status)
current_config = {}
if status["running"]:
info = status["info"]
current_config = {
"browser_type": info["browser_type"],
"port": info["debugging_port"],
"headless": True # Default assumption
}
# Stop the browser
console.print(Panel(
"[cyan]Stopping current builtin browser...[/cyan]",
title="Builtin Browser Restart",
border_style="cyan"
))
success = anyio.run(profiler.kill_builtin_browser)
if not success:
console.print(Panel(
"[red]Failed to stop current browser[/red]",
title="Builtin Browser Restart",
border_style="red"
))
sys.exit(1)
# Use provided options or defaults from current config
browser_type = browser_type or current_config.get("browser_type", "chromium")
port = port or current_config.get("port", 9222)
headless = headless if headless is not None else current_config.get("headless", True)
# Start a new browser
console.print(Panel(
f"[cyan]Starting new builtin browser[/cyan]\n\n"
f"Browser type: [green]{browser_type}[/green]\n"
f"Debugging port: [yellow]{port}[/yellow]\n"
f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]",
title="Builtin Browser Restart",
border_style="cyan"
))
cdp_url = anyio.run(
profiler.launch_builtin_browser,
browser_type,
port,
headless
)
if cdp_url:
console.print(Panel(
f"[green]Builtin browser restarted successfully[/green]\n\n"
f"CDP URL: [cyan]{cdp_url}[/cyan]",
title="Builtin Browser Restart",
border_style="green"
))
else:
console.print(Panel(
"[red]Failed to restart builtin browser[/red]",
title="Builtin Browser Restart",
border_style="red"
))
sys.exit(1)
except Exception as e:
console.print(f"[red]Error restarting builtin browser: {str(e)}[/red]")
sys.exit(1)
@cli.command("cdp")
@click.option("--user-data-dir", "-d", help="Directory to use for browser data (will be created if it doesn't exist)")
@click.option("--port", "-P", type=int, default=9222, help="Debugging port (default: 9222)")
@ -834,6 +1161,7 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
crwl profiles - Manage browser profiles for identity-based crawling
crwl crawl - Crawl a website with advanced options
crwl cdp - Launch browser with CDP debugging enabled
crwl browser - Manage builtin browser (start, stop, status, restart)
crwl examples - Show more usage examples
"""

View File

@ -45,7 +45,34 @@ def post_install():
setup_home_directory()
install_playwright()
run_migration()
setup_builtin_browser()
logger.success("Post-installation setup completed!", tag="COMPLETE")
def setup_builtin_browser():
"""Set up a builtin browser for use with Crawl4AI"""
try:
logger.info("Setting up builtin browser...", tag="INIT")
asyncio.run(_setup_builtin_browser())
logger.success("Builtin browser setup completed!", tag="COMPLETE")
except Exception as e:
logger.warning(f"Failed to set up builtin browser: {e}")
logger.warning("You can manually set up a builtin browser using 'crawl4ai-doctor builtin-browser-start'")
async def _setup_builtin_browser():
try:
# Import BrowserProfiler here to avoid circular imports
from .browser_profiler import BrowserProfiler
profiler = BrowserProfiler(logger=logger)
# Launch the builtin browser
cdp_url = await profiler.launch_builtin_browser(headless=True)
if cdp_url:
logger.success(f"Builtin browser launched at {cdp_url}", tag="BROWSER")
else:
logger.warning("Failed to launch builtin browser", tag="BROWSER")
except Exception as e:
logger.warning(f"Error setting up builtin browser: {e}", tag="BROWSER")
raise
def install_playwright():

View File

@ -0,0 +1,123 @@
# Builtin Browser in Crawl4AI
This document explains the builtin browser feature in Crawl4AI and how to use it effectively.
## What is the Builtin Browser?
The builtin browser is a persistent Chrome instance that Crawl4AI manages for you. It runs in the background and can be used by multiple crawling operations, eliminating the need to start and stop browsers for each crawl.
Benefits include:
- **Faster startup times** - The browser is already running, so your scripts start faster
- **Shared resources** - All your crawling scripts can use the same browser instance
- **Simplified management** - No need to worry about CDP URLs or browser processes
- **Persistent cookies and sessions** - Browser state persists between script runs
- **Less resource usage** - Only one browser instance for multiple scripts
## Using the Builtin Browser
### In Python Code
Using the builtin browser in your code is simple:
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
# Create browser config with builtin mode
browser_config = BrowserConfig(
browser_mode="builtin", # This is the key setting!
headless=True # Can be headless or not
)
# Create the crawler
crawler = AsyncWebCrawler(config=browser_config)
# Use it - no need to explicitly start()
result = await crawler.arun("https://example.com")
```
Key points:
1. Set `browser_mode="builtin"` in your BrowserConfig
2. No need for explicit `start()` call - the crawler will automatically connect to the builtin browser
3. No need to use a context manager or call `close()` - the browser stays running
### Via CLI
The CLI provides commands to manage the builtin browser:
```bash
# Start the builtin browser
crwl browser start
# Check its status
crwl browser status
# Open a visible window to see what the browser is doing
crwl browser view --url https://example.com
# Stop it when no longer needed
crwl browser stop
# Restart with different settings
crwl browser restart --no-headless
```
When crawling via CLI, simply add the builtin browser mode:
```bash
crwl https://example.com -b "browser_mode=builtin"
```
## How It Works
1. When a crawler with `browser_mode="builtin"` is created:
- It checks if a builtin browser is already running
- If not, it automatically launches one
- It connects to the browser via CDP (Chrome DevTools Protocol)
2. The browser process continues running after your script exits
- This means it's ready for the next crawl
- You can manage it via the CLI commands
3. During installation, Crawl4AI attempts to create a builtin browser automatically
## Example
See the [builtin_browser_example.py](builtin_browser_example.py) file for a complete example.
Run it with:
```bash
python builtin_browser_example.py
```
## When to Use
The builtin browser is ideal for:
- Scripts that run frequently
- Development and testing workflows
- Applications that need to minimize startup time
- Systems where you want to manage browser instances centrally
You might not want to use it when:
- Running one-off scripts
- When you need different browser configurations for different tasks
- In environments where persistent processes are not allowed
## Troubleshooting
If you encounter issues:
1. Check the browser status:
```
crwl browser status
```
2. Try restarting it:
```
crwl browser restart
```
3. If problems persist, stop it and let Crawl4AI start a fresh one:
```
crwl browser stop
```

View File

@ -0,0 +1,86 @@
#!/usr/bin/env python3
"""
Builtin Browser Example
This example demonstrates how to use Crawl4AI's builtin browser feature,
which simplifies the browser management process. With builtin mode:
- No need to manually start or connect to a browser
- No need to manage CDP URLs or browser processes
- Automatically connects to an existing browser or launches one if needed
- Browser persists between script runs, reducing startup time
- No explicit cleanup or close() calls needed
The example also demonstrates "auto-starting" where you don't need to explicitly
call start() method on the crawler.
"""
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
import time
async def crawl_with_builtin_browser():
"""
Simple example of crawling with the builtin browser.
Key features:
1. browser_mode="builtin" in BrowserConfig
2. No explicit start() call needed
3. No explicit close() needed
"""
print("\n=== Crawl4AI Builtin Browser Example ===\n")
# Create a browser configuration with builtin mode
browser_config = BrowserConfig(
browser_mode="builtin", # This is the key setting!
headless=True # Can run headless for background operation
)
# Create crawler run configuration
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, # Skip cache for this demo
screenshot=True, # Take a screenshot
verbose=True # Show verbose logging
)
# Create the crawler instance
# Note: We don't need to use "async with" context manager
crawler = AsyncWebCrawler(config=browser_config)
# Start crawling several URLs - no explicit start() needed!
# The crawler will automatically connect to the builtin browser
print("\n➡️ Crawling first URL...")
t0 = time.time()
result1 = await crawler.arun(
url="https://crawl4ai.com",
config=crawler_config
)
t1 = time.time()
print(f"✅ First URL crawled in {t1-t0:.2f} seconds")
print(f" Got {len(result1.markdown.raw_markdown)} characters of content")
print(f" Title: {result1.metadata.get('title', 'No title')}")
# Try another URL - the browser is already running, so this should be faster
print("\n➡️ Crawling second URL...")
t0 = time.time()
result2 = await crawler.arun(
url="https://example.com",
config=crawler_config
)
t1 = time.time()
print(f"✅ Second URL crawled in {t1-t0:.2f} seconds")
print(f" Got {len(result2.markdown.raw_markdown)} characters of content")
print(f" Title: {result2.metadata.get('title', 'No title')}")
# The builtin browser continues running in the background
# No need to explicitly close it
print("\n🔄 The builtin browser remains running for future use")
print(" You can use 'crwl browser status' to check its status")
print(" or 'crwl browser stop' to stop it when completely done")
async def main():
"""Run the example"""
await crawl_with_builtin_browser()
if __name__ == "__main__":
asyncio.run(main())

View File

@ -0,0 +1,300 @@
"""
Test script for browser_profiler and builtin browser functionality.
This script tests:
1. Creating a builtin browser
2. Getting browser information
3. Killing the browser
4. Restarting the browser
5. Testing crawling with different browser modes
6. Testing edge cases
"""
import asyncio
import os
import sys
import time
from colorama import Fore, init
# Add the project root to the path for imports
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser_profiler import BrowserProfiler
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Initialize colorama for cross-platform colored terminal output
init()
# Define colors for pretty output
SUCCESS = Fore.GREEN
WARNING = Fore.YELLOW
ERROR = Fore.RED
INFO = Fore.CYAN
RESET = Fore.RESET
# Create logger
logger = AsyncLogger(verbose=True)
async def test_browser_profiler():
"""Test the BrowserProfiler class functionality"""
print(f"\n{INFO}========== Testing BrowserProfiler =========={RESET}")
# Initialize browser profiler
profiler = BrowserProfiler(logger=logger)
# Step 1: Check if builtin browser exists and kill it if it does
print(f"\n{INFO}1. Checking if builtin browser exists{RESET}")
browser_info = profiler.get_builtin_browser_info()
if browser_info:
print(f"{SUCCESS}Builtin browser found: {browser_info['cdp_url']}{RESET}")
# Kill it to start with a clean state
print(f"{INFO}Killing existing browser...{RESET}")
await profiler.kill_builtin_browser()
browser_info = profiler.get_builtin_browser_info()
if not browser_info:
print(f"{SUCCESS}Browser successfully killed{RESET}")
else:
print(f"{ERROR}Failed to kill browser{RESET}")
else:
print(f"{WARNING}No builtin browser found{RESET}")
# Step 2: Launch a new builtin browser
print(f"\n{INFO}2. Launching new builtin browser{RESET}")
cdp_url = await profiler.launch_builtin_browser(headless=True)
if cdp_url:
print(f"{SUCCESS}Builtin browser launched at: {cdp_url}{RESET}")
else:
print(f"{ERROR}Failed to launch builtin browser{RESET}")
return
# Step 3: Get and display browser information
print(f"\n{INFO}3. Getting browser information{RESET}")
browser_info = profiler.get_builtin_browser_info()
if browser_info:
print(f"{SUCCESS}Browser info retrieved:{RESET}")
for key, value in browser_info.items():
if key != 'config': # Skip the verbose config section
print(f" {key}: {value}")
else:
print(f"{ERROR}Failed to get browser information{RESET}")
# Step 4: Get browser status
print(f"\n{INFO}4. Getting browser status{RESET}")
status = await profiler.get_builtin_browser_status()
print(f"Running: {status['running']}")
print(f"CDP URL: {status['cdp_url']}")
# Pause to let the browser run for a moment
print(f"\n{INFO}Waiting for 2 seconds...{RESET}")
await asyncio.sleep(2)
return cdp_url # Return the CDP URL for the crawling tests
async def test_crawling_with_builtin_browser(cdp_url):
"""Test crawling with the builtin browser"""
print(f"\n{INFO}========== Testing Crawling with Builtin Browser =========={RESET}")
# Step 1: Create a crawler with 'builtin' browser mode
print(f"\n{INFO}1. Creating crawler with 'builtin' browser mode{RESET}")
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True
)
crawler = AsyncWebCrawler(config=browser_config)
# Step 2: Test crawling without explicitly starting (should auto-start)
print(f"\n{INFO}2. Testing auto-start with arun{RESET}")
try:
result = await crawler.arun("https://crawl4ai.com")
print(f"{SUCCESS}Auto-start crawling successful!{RESET}")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
except Exception as e:
print(f"{ERROR}Auto-start crawling failed: {str(e)}{RESET}")
# Close the crawler
await crawler.close()
# Step 3: Test with explicit start
print(f"\n{INFO}3. Testing with explicit start{RESET}")
crawler = AsyncWebCrawler(config=browser_config)
try:
await crawler.start()
print(f"{SUCCESS}Explicit start successful!{RESET}")
result = await crawler.arun("https://example.com")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
# Try second time, no start needed
print(f"{INFO}Testing second arun call without start{RESET}")
result = await crawler.arun("https://example.com")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
except Exception as e:
print(f"{ERROR}Explicit start crawling failed: {str(e)}{RESET}")
# Close the crawler
await crawler.close()
# Step 4: Test with context manager
print(f"\n{INFO}4. Testing with context manager{RESET}")
try:
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun("https://httpbin.org/html")
print(f"{SUCCESS}Context manager crawling successful!{RESET}")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
except Exception as e:
print(f"{ERROR}Context manager crawling failed: {str(e)}{RESET}")
return True
async def test_crawling_without_builtin_browser():
"""Test crawling after killing the builtin browser"""
print(f"\n{INFO}========== Testing Crawling Without Builtin Browser =========={RESET}")
# Step 1: Kill the builtin browser
print(f"\n{INFO}1. Killing the builtin browser{RESET}")
profiler = BrowserProfiler(logger=logger)
await profiler.kill_builtin_browser()
# Step 2: Create a crawler with 'builtin' mode (should fall back to dedicated)
print(f"\n{INFO}2. Creating crawler with 'builtin' mode (should fall back){RESET}")
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True
)
try:
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun("https://httpbin.org/get")
print(f"{SUCCESS}Fallback to dedicated browser successful!{RESET}")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
except Exception as e:
print(f"{ERROR}Fallback crawler failed: {str(e)}{RESET}")
# Step 3: Test with direct CDP URL
print(f"\n{INFO}3. Testing with direct CDP URL connection{RESET}")
# Launch a standalone browser to get a CDP URL
print(f"{INFO}Launching standalone browser...{RESET}")
cdp_url = await profiler.launch_standalone_browser(headless=True)
if not cdp_url:
print(f"{ERROR}Failed to launch standalone browser{RESET}")
return
print(f"{SUCCESS}Got CDP URL: {cdp_url}{RESET}")
# Create a crawler with the CDP URL
browser_config = BrowserConfig(
browser_mode="dedicated",
cdp_url=cdp_url,
use_managed_browser=True,
headless=True
)
try:
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun("https://httpbin.org/ip")
print(f"{SUCCESS}Direct CDP URL crawling successful!{RESET}")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
except Exception as e:
print(f"{ERROR}Direct CDP URL crawling failed: {str(e)}{RESET}")
return True
async def test_edge_cases():
"""Test edge cases like multiple starts, killing browser during crawl, etc."""
print(f"\n{INFO}========== Testing Edge Cases =========={RESET}")
# Step 1: Launch the builtin browser if it doesn't exist
print(f"\n{INFO}1. Ensuring builtin browser exists{RESET}")
profiler = BrowserProfiler(logger=logger)
browser_info = profiler.get_builtin_browser_info()
if not browser_info:
cdp_url = await profiler.launch_builtin_browser(headless=True)
if cdp_url:
print(f"{SUCCESS}Builtin browser launched at: {cdp_url}{RESET}")
else:
print(f"{ERROR}Failed to launch builtin browser{RESET}")
return
else:
print(f"{SUCCESS}Using existing builtin browser: {browser_info['cdp_url']}{RESET}")
# Step 2: Test multiple starts with the same crawler
print(f"\n{INFO}2. Testing multiple starts with the same crawler{RESET}")
browser_config = BrowserConfig(browser_mode="builtin", headless=True)
crawler = AsyncWebCrawler(config=browser_config)
await crawler.start()
print(f"{SUCCESS}First start successful!{RESET}")
try:
await crawler.start()
print(f"{SUCCESS}Second start didn't cause errors!{RESET}")
except Exception as e:
print(f"{ERROR}Second start failed: {str(e)}{RESET}")
# Run a crawl to verify functionality
try:
result = await crawler.arun("https://httpbin.org/user-agent")
print(f"{SUCCESS}Crawling after multiple starts successful!{RESET}")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
except Exception as e:
print(f"{ERROR}Crawling after multiple starts failed: {str(e)}{RESET}")
await crawler.close()
# Step 3: Test killing browser while crawler is active
print(f"\n{INFO}3. Testing killing browser while crawler is active{RESET}")
# Create and start a crawler
browser_config = BrowserConfig(browser_mode="builtin", headless=True)
crawler = AsyncWebCrawler(config=browser_config)
await crawler.start()
# Kill the browser
print(f"{INFO}Killing the browser...{RESET}")
await profiler.kill_builtin_browser()
# Try to crawl (should fail)
try:
result = await crawler.arun("https://httpbin.org/get")
print(f"{WARNING}Crawling succeeded despite killed browser!{RESET}")
except Exception as e:
print(f"{SUCCESS}Crawling failed as expected: {str(e)}{RESET}")
await crawler.close()
return True
async def main():
"""Run all tests"""
try:
print(f"{INFO}Starting browser_profiler and builtin browser tests{RESET}")
# Run browser profiler tests
cdp_url = await test_browser_profiler()
# Run crawling tests with builtin browser
if cdp_url:
await test_crawling_with_builtin_browser(cdp_url)
# Run tests without builtin browser
# await test_crawling_without_builtin_browser()
# Run edge case tests
# await test_edge_cases()
print(f"\n{SUCCESS}All tests completed!{RESET}")
except Exception as e:
print(f"\n{ERROR}Test failed with error: {str(e)}{RESET}")
import traceback
traceback.print_exc()
finally:
# Clean up: kill any remaining builtin browser
print(f"\n{INFO}Cleaning up: killing any remaining builtin browser{RESET}")
profiler = BrowserProfiler(logger=logger)
await profiler.kill_builtin_browser()
print(f"{SUCCESS}Test cleanup complete{RESET}")
if __name__ == "__main__":
asyncio.run(main())