feat(browser): add BrowserProfiler class for identity-based browsing

Adds a new BrowserProfiler class that provides comprehensive management of browser profiles for identity-based crawling. Features include:
- Interactive profile creation and management
- Profile listing, retrieval, and deletion
- Guided console interface
- Migration of profile management from ManagedBrowser
- New example script for identity-based browsing

ALSO:
- Updates the SCRAPE log format in AsyncWebCrawler (timing now reported in seconds instead of milliseconds)
- Removes the content filter from the hello_world example
- Relaxes the httpx version constraint from ==0.27.2 to >=0.27.2

BREAKING CHANGE: Profile management methods on ManagedBrowser are now deprecated and delegate to BrowserProfiler.
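
A minimal migration sketch for callers of the deprecated methods ("my-profile" is a placeholder name):

```python
import asyncio
from crawl4ai.browser_profiler import BrowserProfiler

async def migrate_example():
    # Before (deprecated, still works via delegation):
    #   profile_path = await ManagedBrowser.create_profile(profile_name="my-profile")
    # After: call BrowserProfiler directly
    profiler = BrowserProfiler()
    profile_path = await profiler.create_profile(profile_name="my-profile")
    print(profile_path)

asyncio.run(migrate_example())
```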
UncleCode 2025-03-02 20:32:29 +08:00
parent a9e24307cc
commit cba4a466e5
9 changed files with 844 additions and 14 deletions

View File

@@ -42,6 +42,7 @@ from .async_dispatcher import (
)
from .docker_client import Crawl4aiDockerClient
from .hub import CrawlerHub
from .browser_profiler import BrowserProfiler
from .deep_crawling import (
DeepCrawlStrategy,
BFSDeepCrawlStrategy,
@@ -66,6 +67,7 @@ __all__ = [
"AsyncLoggerBase",
"AsyncLogger",
"AsyncWebCrawler",
"BrowserProfiler",
"DeepCrawlStrategy",
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",

View File

@@ -584,9 +584,9 @@ class AsyncWebCrawler:
# Log processing completion
self.logger.info(
message="Processed {url:.50}... | Time: {timing}ms",
message="{url:.50}... | Time: {timing}s",
tag="SCRAPE",
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)},
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
)
################################

View File

@@ -74,6 +74,7 @@ class ManagedBrowser:
_get_browser_args(): Returns browser-specific command line arguments.
_get_user_data_dir(): Returns the user data directory path.
_cleanup(): Terminates the browser process and removes the temporary directory.
create_profile(): Static method to create a user profile by launching a browser for user interaction.
"""
browser_type: str
@@ -288,6 +289,80 @@
tag="ERROR",
params={"error": str(e)},
)
# These methods have been moved to the BrowserProfiler class
@staticmethod
async def create_profile(browser_config=None, profile_name=None, logger=None):
"""
This method has been moved to the BrowserProfiler class.
Creates a browser profile by launching a browser for interactive user setup
and waits until the user closes it. The profile is stored in a directory that
can be used later with BrowserConfig.user_data_dir.
Please use BrowserProfiler.create_profile() instead.
Example:
```python
from crawl4ai.browser_profiler import BrowserProfiler
profiler = BrowserProfiler()
profile_path = await profiler.create_profile(profile_name="my-login-profile")
```
"""
from .browser_profiler import BrowserProfiler
# Create a BrowserProfiler instance and delegate to it
profiler = BrowserProfiler(logger=logger)
return await profiler.create_profile(profile_name=profile_name, browser_config=browser_config)
@staticmethod
def list_profiles():
"""
This method has been moved to the BrowserProfiler class.
Lists all available browser profiles in the Crawl4AI profiles directory.
Please use BrowserProfiler.list_profiles() instead.
Example:
```python
from crawl4ai.browser_profiler import BrowserProfiler
profiler = BrowserProfiler()
profiles = profiler.list_profiles()
```
"""
from .browser_profiler import BrowserProfiler
# Create a BrowserProfiler instance and delegate to it
profiler = BrowserProfiler()
return profiler.list_profiles()
@staticmethod
def delete_profile(profile_name_or_path):
"""
This method has been moved to the BrowserProfiler class.
Delete a browser profile by name or path.
Please use BrowserProfiler.delete_profile() instead.
Example:
```python
from crawl4ai.browser_profiler import BrowserProfiler
profiler = BrowserProfiler()
success = profiler.delete_profile("my-profile")
```
"""
from .browser_profiler import BrowserProfiler
# Create a BrowserProfiler instance and delegate to it
profiler = BrowserProfiler()
return profiler.delete_profile(profile_name_or_path)
class BrowserManager:
@@ -304,6 +379,7 @@ class BrowserManager:
sessions (dict): Dictionary to store session information
session_ttl (int): Session timeout in seconds
"""
def __init__(self, browser_config: BrowserConfig, logger=None):
"""

View File

@@ -0,0 +1,544 @@
"""
Browser Profiler Module
This module provides a dedicated class for managing browser profiles
that can be used for identity-based crawling with Crawl4AI.
"""
import os
import asyncio
import signal
import sys
import datetime
import uuid
import shutil
from typing import List, Dict, Optional, Any
from colorama import Fore, Style, init
from .async_configs import BrowserConfig
from .browser_manager import ManagedBrowser
from .async_logger import AsyncLogger, AsyncLoggerBase
from .utils import get_home_folder
class BrowserProfiler:
"""
A dedicated class for managing browser profiles for Crawl4AI.
The BrowserProfiler allows you to:
- Create browser profiles interactively
- List available profiles
- Delete profiles when no longer needed
- Get profile paths for use in BrowserConfig
Profiles are stored by default in ~/.crawl4ai/profiles/
"""
def __init__(self, logger: Optional[AsyncLoggerBase] = None):
"""
Initialize the BrowserProfiler.
Args:
logger (AsyncLoggerBase, optional): Logger for outputting messages.
If None, a default AsyncLogger will be created.
"""
# Initialize colorama for colorful terminal output
init()
# Create a logger if not provided
if logger is None:
self.logger = AsyncLogger(verbose=True)
elif not isinstance(logger, AsyncLoggerBase):
self.logger = AsyncLogger(verbose=True)
else:
self.logger = logger
# Ensure profiles directory exists
self.profiles_dir = os.path.join(get_home_folder(), "profiles")
os.makedirs(self.profiles_dir, exist_ok=True)
async def create_profile(self,
profile_name: Optional[str] = None,
browser_config: Optional[BrowserConfig] = None) -> Optional[str]:
"""
Creates a browser profile by launching a browser for interactive user setup
and waits until the user closes it. The profile is stored in a directory that
can be used later with BrowserConfig.user_data_dir.
Args:
profile_name (str, optional): Name for the profile directory.
If None, a name is generated based on timestamp.
browser_config (BrowserConfig, optional): Configuration for the browser.
If None, a default configuration is used with headless=False.
Returns:
str: Path to the created profile directory, or None if creation failed
Example:
```python
profiler = BrowserProfiler()
# Create a profile interactively
profile_path = await profiler.create_profile(
profile_name="my-login-profile"
)
# Use the profile in a crawler
browser_config = BrowserConfig(
headless=True,
use_managed_browser=True,
user_data_dir=profile_path
)
async with AsyncWebCrawler(config=browser_config) as crawler:
# The crawler will now use your profile with all your cookies and login state
result = await crawler.arun("https://example.com/dashboard")
```
"""
# Create default browser config if none provided
if browser_config is None:
from .async_configs import BrowserConfig
browser_config = BrowserConfig(
browser_type="chromium",
headless=False, # Must be visible for user interaction
verbose=True
)
else:
# Ensure headless is False for user interaction
browser_config.headless = False
# Generate profile name if not provided
if not profile_name:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
profile_name = f"profile_{timestamp}_{uuid.uuid4().hex[:6]}"
# Sanitize profile name (replace spaces and special chars)
profile_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in profile_name)
# Set user data directory
profile_path = os.path.join(self.profiles_dir, profile_name)
os.makedirs(profile_path, exist_ok=True)
# Print instructions for the user with colorama formatting
border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
self.logger.info(f"\n{border}", tag="PROFILE")
self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE")
self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE")
self.logger.info("\nInstructions:", tag="PROFILE")
self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE")
self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE")
self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE")
self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE")
self.logger.info(f"{border}\n", tag="PROFILE")
# Create managed browser instance
managed_browser = ManagedBrowser(
browser_type=browser_config.browser_type,
user_data_dir=profile_path,
headless=False, # Must be visible
logger=self.logger,
debugging_port=browser_config.debugging_port
)
# Set up signal handlers to ensure cleanup on interrupt
original_sigint = signal.getsignal(signal.SIGINT)
original_sigterm = signal.getsignal(signal.SIGTERM)
# Define cleanup handler for signals
async def cleanup_handler(sig, frame):
self.logger.warning("\nCleaning up browser process...", tag="PROFILE")
await managed_browser.cleanup()
# Restore original signal handlers
signal.signal(signal.SIGINT, original_sigint)
signal.signal(signal.SIGTERM, original_sigterm)
if sig == signal.SIGINT:
self.logger.error("Profile creation interrupted. Profile may be incomplete.", tag="PROFILE")
sys.exit(1)
# Set signal handlers
def sigint_handler(sig, frame):
asyncio.create_task(cleanup_handler(sig, frame))
signal.signal(signal.SIGINT, sigint_handler)
signal.signal(signal.SIGTERM, sigint_handler)
# Event to signal when user is done with the browser
user_done_event = asyncio.Event()
# Run keyboard input loop in a separate task
async def listen_for_quit_command():
import termios
import tty
import select
# First output the prompt
self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE")
# Save original terminal settings
fd = sys.stdin.fileno()
old_settings = termios.tcgetattr(fd)
try:
# Switch to non-canonical mode (no line buffering)
tty.setcbreak(fd)
while True:
# Check if input is available (non-blocking)
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
if readable:
key = sys.stdin.read(1)
if key.lower() == 'q':
self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE")
user_done_event.set()
return
# Check if the browser process has already exited
if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE")
user_done_event.set()
return
await asyncio.sleep(0.1)
finally:
# Restore terminal settings
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
try:
# Start the browser
await managed_browser.start()
# Check if browser started successfully
browser_process = managed_browser.browser_process
if not browser_process:
self.logger.error("Failed to start browser process.", tag="PROFILE")
return None
self.logger.info(f"Browser launched. {Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE")
# Start listening for keyboard input
listener_task = asyncio.create_task(listen_for_quit_command())
# Wait for either the user to press 'q' or for the browser process to exit naturally
while not user_done_event.is_set() and browser_process.poll() is None:
await asyncio.sleep(0.5)
# Cancel the listener task if it's still running
if not listener_task.done():
listener_task.cancel()
try:
await listener_task
except asyncio.CancelledError:
pass
# If the browser is still running and the user pressed 'q', terminate it
if browser_process.poll() is None and user_done_event.is_set():
self.logger.info("Terminating browser process...", tag="PROFILE")
await managed_browser.cleanup()
self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE")
except Exception as e:
self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE")
await managed_browser.cleanup()
return None
finally:
# Restore original signal handlers
signal.signal(signal.SIGINT, original_sigint)
signal.signal(signal.SIGTERM, original_sigterm)
# Make sure browser is fully cleaned up
await managed_browser.cleanup()
# Return the profile path
return profile_path
def list_profiles(self) -> List[Dict[str, Any]]:
"""
Lists all available browser profiles in the Crawl4AI profiles directory.
Returns:
list: A list of dictionaries containing profile information:
[{"name": "profile_name", "path": "/path/to/profile", "created": datetime, "type": "chromium|firefox"}]
Example:
```python
profiler = BrowserProfiler()
# List all available profiles
profiles = profiler.list_profiles()
for profile in profiles:
print(f"Profile: {profile['name']}")
print(f" Path: {profile['path']}")
print(f" Created: {profile['created']}")
print(f" Browser type: {profile['type']}")
```
"""
if not os.path.exists(self.profiles_dir):
return []
profiles = []
for name in os.listdir(self.profiles_dir):
profile_path = os.path.join(self.profiles_dir, name)
# Skip if not a directory
if not os.path.isdir(profile_path):
continue
# Check if this looks like a valid browser profile
# For Chromium: Look for Preferences file
# For Firefox: Look for prefs.js file
is_valid = False
if os.path.exists(os.path.join(profile_path, "Preferences")) or \
os.path.exists(os.path.join(profile_path, "Default", "Preferences")):
is_valid = "chromium"
elif os.path.exists(os.path.join(profile_path, "prefs.js")):
is_valid = "firefox"
if is_valid:
# Get creation time
created = datetime.datetime.fromtimestamp(
os.path.getctime(profile_path)
)
profiles.append({
"name": name,
"path": profile_path,
"created": created,
"type": is_valid
})
# Sort by creation time, newest first
profiles.sort(key=lambda x: x["created"], reverse=True)
return profiles
def get_profile_path(self, profile_name: str) -> Optional[str]:
"""
Get the full path to a profile by name.
Args:
profile_name (str): Name of the profile (not the full path)
Returns:
str: Full path to the profile directory, or None if not found
Example:
```python
profiler = BrowserProfiler()
path = profiler.get_profile_path("my-profile")
if path:
print(f"Profile path: {path}")
else:
print("Profile not found")
```
"""
profile_path = os.path.join(self.profiles_dir, profile_name)
# Check if path exists and is a valid profile
if not os.path.isdir(profile_path):
return None
# Look for profile indicators
is_profile = (
os.path.exists(os.path.join(profile_path, "Preferences")) or
os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
os.path.exists(os.path.join(profile_path, "prefs.js"))
)
if not is_profile:
return None # Not a valid browser profile
return profile_path
def delete_profile(self, profile_name_or_path: str) -> bool:
"""
Delete a browser profile by name or path.
Args:
profile_name_or_path (str): Name of the profile or full path to profile directory
Returns:
bool: True if the profile was deleted successfully, False otherwise
Example:
```python
profiler = BrowserProfiler()
# Delete by name
success = profiler.delete_profile("my-profile")
# Delete by path
success = profiler.delete_profile("/path/to/.crawl4ai/profiles/my-profile")
```
"""
# Determine if input is a name or a path
if os.path.isabs(profile_name_or_path):
# Full path provided
profile_path = profile_name_or_path
else:
# Just a name provided, construct path
profile_path = os.path.join(self.profiles_dir, profile_name_or_path)
# Check if path exists and is a valid profile
if not os.path.isdir(profile_path):
return False
# Look for profile indicators
is_profile = (
os.path.exists(os.path.join(profile_path, "Preferences")) or
os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
os.path.exists(os.path.join(profile_path, "prefs.js"))
)
if not is_profile:
return False # Not a valid browser profile
# Delete the profile directory
try:
shutil.rmtree(profile_path)
return True
except Exception:
return False
async def interactive_manager(self, crawl_callback=None):
"""
Launch an interactive profile management console.
Args:
crawl_callback (callable, optional): Function to call when selecting option to use
a profile for crawling. It will be called with (profile_path, url).
Example:
```python
profiler = BrowserProfiler()
# Define a custom crawl function
async def my_crawl_function(profile_path, url):
print(f"Crawling {url} with profile {profile_path}")
# Implement your crawling logic here
# Start interactive manager
await profiler.interactive_manager(crawl_callback=my_crawl_function)
```
"""
while True:
self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU")
self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU")
self.logger.info(f"2. {Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU")
self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU")
# Only show crawl option if callback provided
if crawl_callback:
self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU")
self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
exit_option = "5"
else:
self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
exit_option = "4"
choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}")
if choice == "1":
# Create new profile
name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}")
await self.create_profile(name or None)
elif choice == "2":
# List profiles
profiles = self.list_profiles()
if not profiles:
self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
continue
# Print profile information with colorama formatting
self.logger.info("\nAvailable profiles:", tag="PROFILES")
for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES")
self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES")
self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES")
self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES")
self.logger.info("", tag="PROFILES") # Empty line for spacing
elif choice == "3":
# Delete profile
profiles = self.list_profiles()
if not profiles:
self.logger.warning("No profiles found to delete", tag="PROFILES")
continue
# Display numbered list
self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
# Get profile to delete
profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}")
if profile_idx.lower() == 'c':
continue
try:
idx = int(profile_idx) - 1
if 0 <= idx < len(profiles):
profile_name = profiles[idx]["name"]
self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES")
# Confirm deletion
confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}")
if confirm.lower() == 'y':
success = self.delete_profile(profiles[idx]["path"])
if success:
self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES")
else:
self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES")
else:
self.logger.error("Invalid profile number", tag="PROFILES")
except ValueError:
self.logger.error("Please enter a valid number", tag="PROFILES")
elif choice == "4" and crawl_callback:
# Use profile to crawl a site
profiles = self.list_profiles()
if not profiles:
self.logger.warning("No profiles found. Create one first.", tag="PROFILES")
continue
# Display numbered list
self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
# Get profile to use
profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}")
if profile_idx.lower() == 'c':
continue
try:
idx = int(profile_idx) - 1
if 0 <= idx < len(profiles):
profile_path = profiles[idx]["path"]
url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}")
if url:
# Call the provided crawl callback
await crawl_callback(profile_path, url)
else:
self.logger.error("No URL provided", tag="CRAWL")
else:
self.logger.error("Invalid profile number", tag="PROFILES")
except ValueError:
self.logger.error("Please enter a valid number", tag="PROFILES")
elif choice == exit_option:
# Exit
self.logger.info("Exiting profile management", tag="MENU")
break
else:
self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")

View File

@@ -3,12 +3,11 @@ from typing import Optional, Literal, List, Dict, Tuple
import re
from abc import ABC, abstractmethod
import random
from fake_useragent import UserAgent
import requests
from lxml import html
import json
from typing import Optional, List, Union, Dict
from typing import Union
class UAGen(ABC):
@abstractmethod

View File

@@ -16,9 +16,9 @@ async def main():
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
- content_filter=PruningContentFilter(
- threshold=0.48, threshold_type="fixed", min_word_threshold=0
- )
+ # content_filter=PruningContentFilter(
+ #     threshold=0.48, threshold_type="fixed", min_word_threshold=0
+ # )
),
)
result : CrawlResult = await crawler.arun(

View File

@@ -0,0 +1,108 @@
"""
Identity-Based Browsing Example with Crawl4AI
This example demonstrates how to:
1. Create a persistent browser profile interactively
2. List available profiles
3. Use a saved profile for crawling authenticated sites
4. Delete profiles when no longer needed
Uses the new BrowserProfiler class for profile management.
"""
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig
from crawl4ai.browser_profiler import BrowserProfiler
from crawl4ai.async_logger import AsyncLogger
from colorama import Fore, Style, init
# Initialize colorama
init()
# Create a shared logger instance
logger = AsyncLogger(verbose=True)
# Create a shared BrowserProfiler instance
profiler = BrowserProfiler(logger=logger)
async def crawl_with_profile(profile_path, url):
"""Use a profile to crawl an authenticated page"""
logger.info(f"\nCrawling {Fore.CYAN}{url}{Style.RESET_ALL} using profile at {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CRAWL")
# Create browser config with the profile path
browser_config = BrowserConfig(
headless=True, # Set to False if you want to see the browser window
use_managed_browser=True, # Required for persistent profiles
user_data_dir=profile_path
)
start_time = asyncio.get_event_loop().time()
# Initialize crawler with the browser config
async with AsyncWebCrawler(config=browser_config) as crawler:
# Crawl the URL - You should have access to authenticated content now
result = await crawler.arun(url)
elapsed_time = asyncio.get_event_loop().time() - start_time
if result.success:
# Use url_status method for consistent logging
logger.url_status(url, True, elapsed_time, tag="CRAWL")
# Print page title or some indication of success
title = result.metadata.get("title", "")
logger.success(f"Page title: {Fore.GREEN}{title}{Style.RESET_ALL}", tag="CRAWL")
return result
else:
# Log error status
logger.error_status(url, result.error_message, tag="CRAWL")
return None
async def main():
logger.info(f"{Fore.CYAN}Identity-Based Browsing Example with Crawl4AI{Style.RESET_ALL}", tag="DEMO")
logger.info("This example demonstrates using profiles for authenticated browsing", tag="DEMO")
# Choose between interactive mode and automatic mode
mode = input(f"{Fore.CYAN}Run in [i]nteractive mode or [a]utomatic mode? (i/a): {Style.RESET_ALL}").lower()
if mode == 'i':
# Interactive profile management - use the interactive_manager method
# Pass the crawl_with_profile function as the callback for the "crawl a website" option
await profiler.interactive_manager(crawl_callback=crawl_with_profile)
else:
# Automatic mode - simplified example
profiles = profiler.list_profiles()
if not profiles:
# Create a new profile if none exists
logger.info("No profiles found. Creating a new one...", tag="DEMO")
profile_path = await profiler.create_profile()
if not profile_path:
logger.error("Cannot proceed without a valid profile", tag="DEMO")
return
else:
# Use the first (most recent) profile
profile_path = profiles[0]["path"]
logger.info(f"Using existing profile: {Fore.CYAN}{profiles[0]['name']}{Style.RESET_ALL}", tag="DEMO")
# Example: Crawl an authenticated page
urls_to_crawl = [
"https://github.com/settings/profile", # GitHub requires login
# "https://twitter.com/home", # Twitter requires login
# "https://www.linkedin.com/feed/", # LinkedIn requires login
]
for url in urls_to_crawl:
await crawl_with_profile(profile_path, url)
if __name__ == "__main__":
try:
# Run the async main function
asyncio.run(main())
except KeyboardInterrupt:
logger.warning("Example interrupted by user", tag="DEMO")
except Exception as e:
logger.error(f"Error in example: {str(e)}", tag="DEMO")

View File

@@ -167,13 +167,114 @@ async with AsyncWebCrawler() as crawler:
---
## 6. Summary
## 6. Using the BrowserProfiler Class
- **Create** your user-data directory by launching Chrome/Chromium externally with `--user-data-dir=/some/path`.
- **Log in** or configure sites as needed, then close the browser.
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`.
- Enjoy **persistent** sessions that reflect your real identity.
- If you only need quick, ephemeral automation, **Magic Mode** might suffice.
Crawl4AI provides a dedicated `BrowserProfiler` class for managing browser profiles, making it easy to create, list, and delete profiles for identity-based browsing.
### Creating and Managing Profiles with BrowserProfiler
The `BrowserProfiler` class offers a comprehensive API for browser profile management:
```python
import asyncio
from crawl4ai import BrowserProfiler
async def manage_profiles():
# Create a profiler instance
profiler = BrowserProfiler()
# Create a profile interactively - opens a browser window
profile_path = await profiler.create_profile(
profile_name="my-login-profile" # Optional: name your profile
)
print(f"Profile saved at: {profile_path}")
# List all available profiles
profiles = profiler.list_profiles()
for profile in profiles:
print(f"Profile: {profile['name']}")
print(f" Path: {profile['path']}")
print(f" Created: {profile['created']}")
print(f" Browser type: {profile['type']}")
# Get a specific profile path by name
specific_profile = profiler.get_profile_path("my-login-profile")
# Delete a profile when no longer needed
success = profiler.delete_profile("old-profile-name")
asyncio.run(manage_profiles())
```
**How profile creation works:**
1. A browser window opens for you to interact with
2. You log in to websites, set preferences, etc.
3. When you're done, press 'q' in the terminal to close the browser
4. The profile is saved in the Crawl4AI profiles directory
5. You can use the returned path with `BrowserConfig.user_data_dir`, as sketched below
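
A minimal sketch of step 5, reusing a saved profile path in a crawler (the profile name is a placeholder):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, BrowserProfiler

async def reuse_profile():
    profiler = BrowserProfiler()
    # Look up a previously created profile by name (placeholder name)
    profile_path = profiler.get_profile_path("my-login-profile")

    browser_config = BrowserConfig(
        headless=True,
        use_managed_browser=True,  # required for persistent profiles
        user_data_dir=profile_path,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # The crawler reuses the profile's cookies and login state
        result = await crawler.arun("https://example.com/dashboard")
        print(result.success)

asyncio.run(reuse_profile())
```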
### Interactive Profile Management
The `BrowserProfiler` also offers an interactive management console that guides you through profile creation, listing, and deletion:
```python
import asyncio
from crawl4ai import BrowserProfiler, AsyncWebCrawler, BrowserConfig
# Define a function to use a profile for crawling
async def crawl_with_profile(profile_path, url):
browser_config = BrowserConfig(
headless=True,
use_managed_browser=True,
user_data_dir=profile_path
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url)
return result
async def main():
# Create a profiler instance
profiler = BrowserProfiler()
# Launch the interactive profile manager
# Passing the crawl function as a callback adds a "crawl with profile" option
await profiler.interactive_manager(crawl_callback=crawl_with_profile)
asyncio.run(main())
```
### Legacy Methods
For backward compatibility, the previous methods on `ManagedBrowser` are still available, but they delegate to the new `BrowserProfiler` class:
```python
from crawl4ai.browser_manager import ManagedBrowser
# These methods still work but use BrowserProfiler internally
profiles = ManagedBrowser.list_profiles()
```
### Complete Example
See `docs/examples/identity_based_browsing.py` for a complete demonstration of creating and using profiles for authenticated browsing with the new `BrowserProfiler` class.
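
A condensed sketch of that example's automatic mode (reuse the newest saved profile, or create one interactively if none exist):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, BrowserProfiler

async def main():
    profiler = BrowserProfiler()
    profiles = profiler.list_profiles()  # sorted newest first
    # Reuse the most recent profile, or create one interactively
    profile_path = profiles[0]["path"] if profiles else await profiler.create_profile()

    browser_config = BrowserConfig(
        headless=True,
        use_managed_browser=True,
        user_data_dir=profile_path,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Requires a logged-in GitHub session stored in the profile
        result = await crawler.arun("https://github.com/settings/profile")
        if result.success:
            print(result.metadata.get("title", ""))

asyncio.run(main())
```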
---
## 7. Summary
- **Create** your user-data directory either:
- By launching Chrome/Chromium externally with `--user-data-dir=/some/path`
- Or by using the built-in `BrowserProfiler.create_profile()` method
- Or through the interactive interface with `profiler.interactive_manager()`
- **Log in** or configure sites as needed, then close the browser
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`
- **List and reuse** profiles with `BrowserProfiler.list_profiles()`
- **Manage** your profiles with the dedicated `BrowserProfiler` class
- Enjoy **persistent** sessions that reflect your real identity
- If you only need quick, ephemeral automation, **Magic Mode** might suffice
**Recommended**: Always prefer a **Managed Browser** for robust, identity-based crawling and simpler interactions with complex sites. Use **Magic Mode** for quick tasks or prototypes where persistent data is unnecessary.

View File

@@ -36,7 +36,7 @@ dependencies = [
"aiofiles",
"rich>=13.9.4",
"cssselect>=1.2.0",
"httpx==0.27.2",
"httpx>=0.27.2",
"fake-useragent>=2.0.3",
"click>=8.1.7",
"pyperclip>=1.8.2",