
Adds new functionality to crawl websites using saved browser profiles directly from the CLI. This includes: - New CLI option to use profiles for crawling - Helper functions for profile-based crawling - Fixed type hints for config parameters - Updated example to show browser window by default This makes it easier for users to leverage saved browser profiles for crawling without writing code.
108 lines
4.1 KiB
Python
"""
Identity-Based Browsing Example with Crawl4AI

This example demonstrates how to:

1. Create a persistent browser profile interactively
2. List available profiles
3. Use a saved profile for crawling authenticated sites
4. Delete profiles when no longer needed

Uses the new BrowserProfiler class for profile management.
"""
|
|
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
|
from crawl4ai.browser_profiler import BrowserProfiler
|
|
from crawl4ai.async_logger import AsyncLogger
|
|
from colorama import Fore, Style, init
|
|
|
|
# Initialize colorama so ANSI color codes render correctly cross-platform
init()

# Shared logger instance used by every function in this example
logger = AsyncLogger(verbose=True)

# Shared BrowserProfiler instance for creating/listing/deleting profiles
profiler = BrowserProfiler(logger=logger)
|
|
|
|
|
|
async def crawl_with_profile(profile_path, url):
    """Crawl an authenticated page using a saved browser profile.

    Args:
        profile_path: Filesystem path to a persistent browser profile
            directory (as created by BrowserProfiler).
        url: The page to crawl; may require the login state stored in
            the profile.

    Returns:
        The crawl result object on success, or None if the crawl failed.
    """
    logger.info(f"\nCrawling {Fore.CYAN}{url}{Style.RESET_ALL} using profile at {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CRAWL")

    # Create browser config with the profile path
    browser_config = BrowserConfig(
        headless=False,  # Show the browser window (set True to hide it)
        use_managed_browser=True,  # Required for persistent profiles
        user_data_dir=profile_path,
    )

    # get_running_loop() is the supported way to reach the loop from inside
    # a coroutine; asyncio.get_event_loop() here is deprecated since 3.10.
    start_time = asyncio.get_running_loop().time()

    # Initialize crawler with the browser config
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Crawl the URL - the profile should grant access to authenticated content
        result = await crawler.arun(url)

        elapsed_time = asyncio.get_running_loop().time() - start_time

        if result.success:
            # Use url_status method for consistent logging
            logger.url_status(url, True, elapsed_time, tag="CRAWL")

            # Print page title or some indication of success.
            # Guard against a missing metadata dict on partial results.
            title = (result.metadata or {}).get("title", "")
            logger.success(f"Page title: {Fore.GREEN}{title}{Style.RESET_ALL}", tag="CRAWL")
            return result
        else:
            # Log error status
            logger.error_status(url, result.error_message, tag="CRAWL")
            return None
|
|
|
|
|
|
async def main():
    """Run the demo, either via the interactive profile manager or automatically."""
    logger.info(f"{Fore.CYAN}Identity-Based Browsing Example with Crawl4AI{Style.RESET_ALL}", tag="DEMO")
    logger.info("This example demonstrates using profiles for authenticated browsing", tag="DEMO")

    # Let the user pick which flavour of the demo to run
    mode = input(f"{Fore.CYAN}Run in [i]nteractive mode or [a]utomatic mode? (i/a): {Style.RESET_ALL}").lower()

    if mode == 'i':
        # Hand control to the interactive profile manager; crawl_with_profile
        # backs its "crawl a website" menu option.
        await profiler.interactive_manager(crawl_callback=crawl_with_profile)
        return

    # Automatic mode: pick an existing profile, or create one on the fly.
    profiles = profiler.list_profiles()

    if profiles:
        # Profiles are listed most-recent first
        profile_path = profiles[0]["path"]
        logger.info(f"Using existing profile: {Fore.CYAN}{profiles[0]['name']}{Style.RESET_ALL}", tag="DEMO")
    else:
        logger.info("No profiles found. Creating a new one...", tag="DEMO")
        profile_path = await profiler.create_profile()
        if not profile_path:
            logger.error("Cannot proceed without a valid profile", tag="DEMO")
            return

    # Pages that normally require an authenticated session
    urls_to_crawl = [
        "https://github.com/settings/profile",  # GitHub requires login
        # "https://twitter.com/home",  # Twitter requires login
        # "https://www.linkedin.com/feed/",  # LinkedIn requires login
    ]

    for url in urls_to_crawl:
        await crawl_with_profile(profile_path, url)
|
|
|
|
|
|
if __name__ == "__main__":
    # Drive the async entry point; report interrupts and failures via the
    # shared logger instead of a raw traceback.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.warning("Example interrupted by user", tag="DEMO")
    except Exception as e:
        logger.error(f"Error in example: {str(e)}", tag="DEMO")