"""
|
|
|
|
Identity-Based Browsing Example with Crawl4AI
|
|
|
|
|
|
|
|
This example demonstrates how to:
|
|
|
|
1. Create a persistent browser profile interactively
|
|
|
|
2. List available profiles
|
|
|
|
3. Use a saved profile for crawling authenticated sites
|
|
|
|
4. Delete profiles when no longer needed
|
|
|
|
|
|
|
|
Uses the new BrowserProfiler class for profile management.
|
|
|
|
"""
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig
from crawl4ai.browser_profiler import BrowserProfiler
from crawl4ai.async_logger import AsyncLogger
from colorama import Fore, Style, init

# Initialize colorama
init()

# Create a shared logger instance
logger = AsyncLogger(verbose=True)

# Create a shared BrowserProfiler instance
profiler = BrowserProfiler(logger=logger)


async def crawl_with_profile(profile_path, url):
    """Use a profile to crawl an authenticated page"""
    logger.info(f"\nCrawling {Fore.CYAN}{url}{Style.RESET_ALL} using profile at {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CRAWL")

    # Create browser config with the profile path
    browser_config = BrowserConfig(
        headless=False,  # False so you can watch the browser window; set True to run headless
        use_managed_browser=True,  # Required for persistent profiles
        user_data_dir=profile_path
    )

    start_time = asyncio.get_event_loop().time()

    # Initialize crawler with the browser config
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Crawl the URL - you should have access to authenticated content now
        result = await crawler.arun(url)

        elapsed_time = asyncio.get_event_loop().time() - start_time

        if result.success:
            # Use the url_status method for consistent logging
            logger.url_status(url, True, elapsed_time, tag="CRAWL")

            # Print the page title as an indication of success
            title = result.metadata.get("title", "")
            logger.success(f"Page title: {Fore.GREEN}{title}{Style.RESET_ALL}", tag="CRAWL")
            return result
        else:
            # Log the error status
            logger.error_status(url, result.error_message, tag="CRAWL")
            return None
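

# Item 4 in the module docstring ("Delete profiles when no longer needed") is otherwise
# only reachable through the interactive manager. The helper below is a minimal sketch of
# doing it programmatically. It assumes BrowserProfiler exposes a synchronous
# delete_profile(profile_name) method that returns a truthy value on success; the exact
# name, signature, and return type should be verified against your installed crawl4ai
# version before relying on it.
def delete_profile_example(profile_name):
    """Hedged sketch: delete a saved browser profile by name once it is no longer needed."""
    logger.info(f"Deleting profile {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="CLEANUP")

    # Assumed API: delete_profile(profile_name) -> truthy on success
    deleted = profiler.delete_profile(profile_name)

    if deleted:
        logger.success(f"Profile {Fore.CYAN}{profile_name}{Style.RESET_ALL} deleted", tag="CLEANUP")
    else:
        logger.error(f"Could not delete profile {Fore.CYAN}{profile_name}{Style.RESET_ALL}", tag="CLEANUP")
    return deleted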
async def main():
    logger.info(f"{Fore.CYAN}Identity-Based Browsing Example with Crawl4AI{Style.RESET_ALL}", tag="DEMO")
    logger.info("This example demonstrates using profiles for authenticated browsing", tag="DEMO")

    # Choose between interactive mode and automatic mode
    mode = input(f"{Fore.CYAN}Run in [i]nteractive mode or [a]utomatic mode? (i/a): {Style.RESET_ALL}").lower()

    if mode == 'i':
        # Interactive profile management - use the interactive_manager method
        # Pass the crawl_with_profile function as the callback for the "crawl a website" option
        await profiler.interactive_manager(crawl_callback=crawl_with_profile)
    else:
        # Automatic mode - simplified example
        profiles = profiler.list_profiles()

        if not profiles:
            # Create a new profile if none exists
            logger.info("No profiles found. Creating a new one...", tag="DEMO")
            profile_path = await profiler.create_profile()
            if not profile_path:
                logger.error("Cannot proceed without a valid profile", tag="DEMO")
                return
        else:
            # Use the first (most recent) profile
            profile_path = profiles[0]["path"]
            logger.info(f"Using existing profile: {Fore.CYAN}{profiles[0]['name']}{Style.RESET_ALL}", tag="DEMO")

        # Example: Crawl an authenticated page
        urls_to_crawl = [
            "https://github.com/settings/profile",  # GitHub requires login
            # "https://twitter.com/home",  # Twitter requires login
            # "https://www.linkedin.com/feed/",  # LinkedIn requires login
        ]

        for url in urls_to_crawl:
            await crawl_with_profile(profile_path, url)
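
        # Optional cleanup (hedged sketch): a profile that was only needed for this run
        # could be removed afterwards with the delete_profile_example() helper defined
        # above (which itself assumes BrowserProfiler.delete_profile exists), e.g.:
        #
        #     if profiles:
        #         delete_profile_example(profiles[0]["name"])
        #
        # It is left commented out so the example never deletes a profile you may still
        # want to reuse.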


if __name__ == "__main__":
    try:
        # Run the async main function
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.warning("Example interrupted by user", tag="DEMO")
    except Exception as e:
        logger.error(f"Error in example: {str(e)}", tag="DEMO")