
Implements a new browser strategy that runs Chrome in Docker containers, providing better isolation and cross-platform consistency.

Features include:
- Connect and launch modes for different container configurations
- Persistent storage support for maintaining browser state
- Container registry for efficient reuse
- Comprehensive test suite for Docker browser functionality

This addition allows users to run browser automation workloads in isolated containers, improving security and resource management.
653 lines
23 KiB
Python
653 lines
23 KiB
Python
"""Test examples for Docker Browser Strategy.
|
|
|
|
These examples demonstrate the functionality of Docker Browser Strategy
and serve as functional tests.
|
|
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
import shutil
|
|
import uuid
|
|
import json
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
|
|
# Add the project root to Python path if running directly.
# The directory three levels above this file is pushed to the front of
# sys.path so the `crawl4ai` imports that follow resolve when this file is
# executed as a script (presumably that directory is the repository root --
# confirm against the repo layout).
if __name__ == "__main__":
    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
|
|
|
|
from crawl4ai.browser import BrowserManager
|
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
|
from crawl4ai.async_logger import AsyncLogger
|
|
from crawl4ai.browser.docker_config import DockerConfig
|
|
from crawl4ai.browser.docker_registry import DockerRegistry
|
|
from crawl4ai.browser.docker_utils import DockerUtils
|
|
|
|
# Create a logger for clear terminal output.
# log_file=None is passed explicitly; presumably this disables file logging
# so output goes to the terminal only -- confirm against AsyncLogger.
logger = AsyncLogger(verbose=True, log_file=None)

# Global Docker utils instance, shared by every test in this module and
# constructed with the module logger.
docker_utils = DockerUtils(logger)
|
|
|
|
async def test_docker_components():
    """Test Docker utilities, registry, and image building.

    This function tests the core Docker components before running the browser
    tests. It validates DockerRegistry, DockerUtils, and builds test images to
    ensure everything is functioning correctly.

    The checks run in order and stop at the first failure:
      1. DockerRegistry round-trip (register, save, reload, read back).
      2. DockerUtils port helpers and config hashing (logged, not asserted).
      3. Docker availability.
      4. Build of the "connect" image.
      5. Build of the "launch" image.
      6. Container creation, command execution, Chrome lookup, and removal.

    Returns:
        bool: True when every check passes; False on the first failed check
        or when any step raises.
    """
    logger.info("Testing Docker components", tag="SETUP")

    # Create a test registry directory
    registry_dir = os.path.join(os.path.dirname(__file__), "test_registry")
    registry_file = os.path.join(registry_dir, "test_registry.json")
    os.makedirs(registry_dir, exist_ok=True)

    # ID of a live test container. While this is set, the `finally` clause
    # removes the container, so it cannot leak on an early return or when a
    # Docker call raises.
    container_id = None

    try:
        # 1. Test DockerRegistry
        logger.info("Testing DockerRegistry...", tag="SETUP")
        registry = DockerRegistry(registry_file)

        # Test saving and loading registry
        test_container_id = "test-container-123"
        registry.register_container(test_container_id, 9876, "test-hash-123")
        registry.save()

        # Create a new registry instance that loads from the file
        registry2 = DockerRegistry(registry_file)
        port = registry2.get_container_host_port(test_container_id)
        hash_value = registry2.get_container_config_hash(test_container_id)

        if port != 9876 or hash_value != "test-hash-123":
            logger.error("DockerRegistry persistence failed", tag="SETUP")
            return False

        # Clean up test container from registry
        registry2.unregister_container(test_container_id)
        logger.success("DockerRegistry works correctly", tag="SETUP")

        # 2. Test DockerUtils
        logger.info("Testing DockerUtils...", tag="SETUP")

        # Test port detection (informational only; the result is not asserted)
        in_use = docker_utils.is_port_in_use(22)  # SSH port is usually in use
        logger.info(f"Port 22 in use: {in_use}", tag="SETUP")

        # Get next available port
        available_port = docker_utils.get_next_available_port(9000)
        logger.info(f"Next available port: {available_port}", tag="SETUP")

        # Test config hash generation
        config_dict = {"mode": "connect", "headless": True}
        config_hash = docker_utils.generate_config_hash(config_dict)
        logger.info(f"Generated config hash: {config_hash[:8]}...", tag="SETUP")

        # 3. Test Docker is available
        logger.info("Checking Docker availability...", tag="SETUP")
        if not await check_docker_available():
            logger.error("Docker is not available - cannot continue tests", tag="SETUP")
            return False

        # 4. Test building connect image
        logger.info("Building connect mode Docker image...", tag="SETUP")
        connect_image = await docker_utils.ensure_docker_image_exists(None, "connect")
        if not connect_image:
            logger.error("Failed to build connect mode image", tag="SETUP")
            return False
        logger.success(f"Successfully built connect image: {connect_image}", tag="SETUP")

        # 5. Test building launch image
        logger.info("Building launch mode Docker image...", tag="SETUP")
        launch_image = await docker_utils.ensure_docker_image_exists(None, "launch")
        if not launch_image:
            logger.error("Failed to build launch mode image", tag="SETUP")
            return False
        logger.success(f"Successfully built launch image: {launch_image}", tag="SETUP")

        # 6. Test creating and removing container
        logger.info("Testing container creation and removal...", tag="SETUP")
        container_id = await docker_utils.create_container(
            image_name=launch_image,
            host_port=available_port,
            container_name="crawl4ai-test-container",
        )

        if not container_id:
            logger.error("Failed to create test container", tag="SETUP")
            return False

        logger.info(f"Created test container: {container_id[:12]}", tag="SETUP")

        # Verify container is running
        if not await docker_utils.is_container_running(container_id):
            logger.error("Test container is not running", tag="SETUP")
            return False

        # Test commands in container
        logger.info("Testing command execution in container...", tag="SETUP")
        returncode, _, stderr = await docker_utils.exec_in_container(
            container_id, ["ls", "-la", "/"]
        )
        if returncode != 0:
            logger.error(f"Command execution failed: {stderr}", tag="SETUP")
            return False

        # Verify Chrome is installed in the container
        returncode, stdout, stderr = await docker_utils.exec_in_container(
            container_id, ["which", "google-chrome"]
        )
        if returncode != 0:
            logger.error("Chrome not found in container", tag="SETUP")
            return False
        logger.info(f"Chrome found at: {stdout.strip()}", tag="SETUP")

        # Test Chrome version
        returncode, stdout, stderr = await docker_utils.exec_in_container(
            container_id, ["google-chrome", "--version"]
        )
        if returncode != 0:
            logger.error(f"Failed to get Chrome version: {stderr}", tag="SETUP")
            return False
        logger.info(f"Chrome version: {stdout.strip()}", tag="SETUP")

        # Remove test container. The removal result is itself under test, so
        # once it has been attempted it must not be retried in `finally`.
        removed = await docker_utils.remove_container(container_id)
        container_id = None
        if not removed:
            logger.error("Failed to remove test container", tag="SETUP")
            return False

        logger.success("Test container removed successfully", tag="SETUP")

        # All components tested successfully
        logger.success("All Docker components tested successfully", tag="SETUP")
        return True

    except Exception as e:
        logger.error(f"Docker component tests failed: {str(e)}", tag="SETUP")
        return False
    finally:
        # Best-effort removal of a container that is still alive because a
        # check failed or raised before the explicit removal step.
        if container_id:
            try:
                await docker_utils.remove_container(container_id)
            except Exception as cleanup_error:
                logger.error(
                    f"Failed to clean up test container: {cleanup_error}", tag="SETUP"
                )

        # Clean up registry test directory. Errors are ignored so a cleanup
        # problem cannot replace the test result with an exception.
        shutil.rmtree(registry_dir, ignore_errors=True)
|
|
|
|
async def test_docker_connect_mode():
    """Test Docker browser in connect mode.

    This tests the basic functionality of creating a browser in Docker
    connect mode and using it for navigation: start the manager, obtain a
    page, load https://example.com and read its title.

    Returns:
        bool: True when the whole sequence completes, False if any step
        raises.
    """
    logger.info("Testing Docker browser in connect mode", tag="TEST")

    # Create temp directory for user data
    temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data")
    os.makedirs(temp_dir, exist_ok=True)

    # Bound before the try block so cleanup never touches an undefined name
    # when configuration or manager construction fails.
    manager = None

    try:
        # Create Docker configuration
        docker_config = DockerConfig(
            mode="connect",
            persistent=False,
            remove_on_exit=True,
            user_data_dir=temp_dir,
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config,
        )

        # Create browser manager
        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig(url="https://example.com")

        # Get a page (the accompanying context is not needed here)
        page, _context = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up. Clearing the reference marks the manager as closed so the
        # `finally` clause does not close it a second time.
        await manager.close()
        manager = None
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        return False
    finally:
        # Ensure cleanup of a manager that is still open (best effort).
        if manager is not None:
            try:
                await manager.close()
            except Exception as close_error:
                logger.error(f"Browser cleanup failed: {close_error}", tag="TEST")

        # Clean up the temp directory. Errors are ignored so that a cleanup
        # problem (e.g. files the container left behind that cannot be
        # deleted) does not override the test result.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
|
|
|
async def test_docker_launch_mode():
    """Test Docker browser in launch mode.

    This tests launching a Chrome browser within a Docker container
    on demand with custom settings (headless, text mode), then navigating to
    https://example.com and reading the page title.

    Returns:
        bool: True when the whole sequence completes, False if any step
        raises.
    """
    logger.info("Testing Docker browser in launch mode", tag="TEST")

    # Create temp directory for user data
    temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data_launch")
    os.makedirs(temp_dir, exist_ok=True)

    # Bound before the try block so cleanup never touches an undefined name
    # when configuration or manager construction fails.
    manager = None

    try:
        # Create Docker configuration
        docker_config = DockerConfig(
            mode="launch",
            persistent=False,
            remove_on_exit=True,
            user_data_dir=temp_dir,
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            text_mode=True,  # Enable text mode for faster operation
            docker_config=docker_config,
        )

        # Create browser manager
        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig(url="https://example.com")

        # Get a page (the accompanying context is not needed here)
        page, _context = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up. Clearing the reference marks the manager as closed so the
        # `finally` clause does not close it a second time.
        await manager.close()
        manager = None
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        return False
    finally:
        # Ensure cleanup of a manager that is still open (best effort).
        if manager is not None:
            try:
                await manager.close()
            except Exception as close_error:
                logger.error(f"Browser cleanup failed: {close_error}", tag="TEST")

        # Clean up the temp directory. Errors are ignored so that a cleanup
        # problem (e.g. files the container left behind that cannot be
        # deleted) does not override the test result.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
|
|
|
async def test_docker_persistent_storage():
    """Test Docker browser with persistent storage.

    This tests creating localStorage data in one session and verifying
    it persists to another session when using persistent storage. Both
    sessions share one DockerConfig whose user data directory is a unique
    temporary directory created for this run.

    Returns:
        bool: True when the second session reads back the value written by
        the first one; False on a mismatch or if any step raises.
    """
    logger.info("Testing Docker browser with persistent storage", tag="TEST")

    # Create a unique temp directory
    test_id = uuid.uuid4().hex[:8]
    temp_dir = os.path.join(os.path.dirname(__file__), f"tmp_user_data_persist_{test_id}")
    os.makedirs(temp_dir, exist_ok=True)

    # The manager that is currently open, if any. Tracking a single "open"
    # reference (instead of manager1/manager2) avoids closing an
    # already-closed manager again during error cleanup.
    open_manager = None

    try:
        # Create Docker configuration with persistence
        docker_config = DockerConfig(
            mode="connect",
            persistent=True,  # Keep container running between sessions
            user_data_dir=temp_dir,
            container_user_data_dir="/data",
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config,
        )

        # Create crawler config (shared by both sessions)
        crawler_config = CrawlerRunConfig()

        # --- Session 1: write a value into localStorage ---
        manager1 = BrowserManager(browser_config=browser_config, logger=logger)
        open_manager = manager1

        await manager1.start()
        logger.info("First browser started successfully", tag="TEST")

        page1, _context1 = await manager1.get_page(crawler_config)

        # Navigate to example.com
        await page1.goto("https://example.com")

        # Set localStorage item
        test_value = f"test_value_{test_id}"
        await page1.evaluate(f"localStorage.setItem('test_key', '{test_value}')")
        logger.info(f"Set localStorage test_key = {test_value}", tag="TEST")

        # Close the first browser manager
        await manager1.close()
        open_manager = None
        logger.info("First browser closed", tag="TEST")

        # --- Session 2: read the value back with the same config ---
        manager2 = BrowserManager(browser_config=browser_config, logger=logger)
        open_manager = manager2

        await manager2.start()
        logger.info("Second browser started successfully", tag="TEST")

        page2, _context2 = await manager2.get_page(crawler_config)

        # Navigate to same site
        await page2.goto("https://example.com")

        # Get localStorage item
        value = await page2.evaluate("localStorage.getItem('test_key')")
        logger.info(f"Retrieved localStorage test_key = {value}", tag="TEST")

        # Check if persistence worked
        persisted = value == test_value
        if persisted:
            logger.success("Storage persistence verified!", tag="TEST")
        else:
            logger.error(f"Storage persistence failed! Expected {test_value}, got {value}", tag="TEST")

        # Clean up. The verdict is already known, so switch the container to
        # non-persistent before the final close; otherwise every run would
        # leave a container behind whose user data directory is deleted
        # below. This mirrors the cleanup in test_docker_registry_reuse.
        # NOTE(review): assumes the strategy re-reads these flags at close
        # time -- confirm against the Docker strategy implementation.
        docker_strategy = manager2._strategy
        docker_strategy.docker_config.persistent = False
        docker_strategy.docker_config.remove_on_exit = True
        await manager2.close()
        open_manager = None
        logger.info("Second browser closed successfully", tag="TEST")

        return persisted
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        return False
    finally:
        # Ensure cleanup of a manager that is still open (best effort).
        if open_manager is not None:
            try:
                await open_manager.close()
            except Exception as close_error:
                logger.error(f"Browser cleanup failed: {close_error}", tag="TEST")

        # Clean up the temp directory. Errors are ignored so that a cleanup
        # problem (e.g. files the container left behind that cannot be
        # deleted) does not override the test result.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
|
|
|
async def test_docker_parallel_pages():
    """Test Docker browser with parallel page creation.

    This tests the ability to create and use multiple pages in parallel
    from a single Docker browser instance: three pages are requested at once,
    navigated concurrently, and their titles are logged.

    Returns:
        bool: True when the expected number of pages is obtained and all
        navigations complete; False on a page-count mismatch or if any step
        raises.
    """
    logger.info("Testing Docker browser with parallel pages", tag="TEST")

    # Bound before the try block so cleanup never touches an undefined name
    # when configuration or manager construction fails.
    manager = None

    try:
        # Create Docker configuration
        docker_config = DockerConfig(
            mode="connect",
            persistent=False,
            remove_on_exit=True,
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config,
        )

        # Create browser manager
        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig()

        # Get multiple pages
        page_count = 3
        pages = await manager.get_pages(crawler_config, count=page_count)
        logger.info(f"Got {len(pages)} pages successfully", tag="TEST")

        if len(pages) != page_count:
            logger.error(f"Expected {page_count} pages, got {len(pages)}", tag="TEST")
            # The manager is closed by the `finally` clause.
            return False

        # Navigate to a distinct URL with each page and wait for all
        # navigations to complete. Each entry of `pages` is a
        # (page, context) pair; only the page is used.
        await asyncio.gather(
            *(
                page.goto(f"https://example.com?page={i}")
                for i, (page, _) in enumerate(pages)
            )
        )
        logger.info("All pages navigated successfully", tag="TEST")

        # Get titles from all pages
        for i, (page, _) in enumerate(pages):
            title = await page.title()
            logger.info(f"Page {i+1} title: {title}", tag="TEST")

        # Clean up. Clearing the reference marks the manager as closed so the
        # `finally` clause does not close it a second time.
        await manager.close()
        manager = None
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        return False
    finally:
        # Ensure cleanup of a manager that is still open (best effort).
        if manager is not None:
            try:
                await manager.close()
            except Exception as close_error:
                logger.error(f"Browser cleanup failed: {close_error}", tag="TEST")
|
|
|
|
async def test_docker_registry_reuse():
    """Test Docker container reuse via registry.

    This tests that containers with matching configurations
    are reused rather than creating new ones. Two managers are started one
    after the other with identical persistent DockerConfigs that share a
    registry file, and the container IDs reported by their strategies are
    compared.

    Returns:
        bool: True when both managers report the same container ID; False
        when the IDs differ or if any step raises.
    """
    logger.info("Testing Docker container reuse via registry", tag="TEST")

    # Create registry for this test
    registry_dir = os.path.join(os.path.dirname(__file__), "registry_reuse_test")
    registry_file = os.path.join(registry_dir, "registry.json")
    os.makedirs(registry_dir, exist_ok=True)

    # The manager that is currently open, if any (prevents double closes).
    open_manager = None
    # IDs of containers started by this test that may still exist. Anything
    # left in this set when the test ends is force-removed, so a failed
    # reuse or an exception cannot leave a persistent container behind.
    leftover_containers = set()

    try:
        # Create identical Docker configurations with custom registry
        docker_config1 = DockerConfig(
            mode="connect",
            persistent=True,  # Keep container running after closing
            registry_file=registry_file,
        )

        # Create first browser configuration
        browser_config1 = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config1,
        )

        # Create first browser manager
        manager1 = BrowserManager(browser_config=browser_config1, logger=logger)
        open_manager = manager1

        # Start the first browser
        await manager1.start()
        logger.info("First browser started successfully", tag="TEST")

        # Get container ID from the strategy
        docker_strategy1 = manager1._strategy
        container_id1 = docker_strategy1.container_id
        if container_id1:
            leftover_containers.add(container_id1)
        logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST")

        # Close the first manager but keep container running
        await manager1.close()
        open_manager = None
        logger.info("First browser closed", tag="TEST")

        # Create second Docker configuration identical to first
        docker_config2 = DockerConfig(
            mode="connect",
            persistent=True,
            registry_file=registry_file,
        )

        # Create second browser configuration
        browser_config2 = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config2,
        )

        # Create second browser manager
        manager2 = BrowserManager(browser_config=browser_config2, logger=logger)
        open_manager = manager2

        # Start the second browser - should reuse existing container
        await manager2.start()
        logger.info("Second browser started successfully", tag="TEST")

        # Get container ID from the second strategy
        docker_strategy2 = manager2._strategy
        container_id2 = docker_strategy2.container_id
        if container_id2:
            leftover_containers.add(container_id2)
        logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST")

        # Verify container reuse
        reused = container_id1 == container_id2
        if reused:
            logger.success("Container reuse successful - using same container!", tag="TEST")
        else:
            logger.error("Container reuse failed - new container created!", tag="TEST")

        # Clean up: make the second manager remove its container on close.
        docker_strategy2.docker_config.persistent = False
        docker_strategy2.docker_config.remove_on_exit = True
        await manager2.close()
        open_manager = None
        leftover_containers.discard(container_id2)
        logger.info("Second browser closed and container removed", tag="TEST")

        # If reuse failed, container_id1 is still in `leftover_containers`
        # and is force-removed in `finally`.
        return reused
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        return False
    finally:
        # Ensure cleanup of a manager that is still open (best effort).
        if open_manager is not None:
            try:
                await open_manager.close()
            except Exception as close_error:
                logger.error(f"Browser cleanup failed: {close_error}", tag="TEST")

        # Make sure no container started by this test is left behind.
        for leftover_id in leftover_containers:
            try:
                await docker_utils.remove_container(leftover_id, force=True)
            except Exception as remove_error:
                logger.error(
                    f"Failed to remove container {leftover_id[:12]}: {remove_error}",
                    tag="TEST",
                )

        # Clean up registry directory. Errors are ignored so a cleanup
        # problem cannot replace the test result with an exception.
        shutil.rmtree(registry_dir, ignore_errors=True)
|
|
|
|
async def run_tests():
    """Run all tests sequentially.

    Bails out early (returning None) when Docker is unavailable or when the
    component checks fail. Otherwise every browser test is awaited one after
    another and a pass/fail summary is logged.
    """
    logger.info("Starting Docker Browser Strategy tests", tag="TEST")

    # Guard 1: Docker itself must be reachable.
    docker_ready = await check_docker_available()
    if not docker_ready:
        logger.error("Docker is not available - skipping tests", tag="TEST")
        return

    # Guard 2: the low-level Docker components must work.
    components_ok = await test_docker_components()
    if not components_ok:
        logger.error("Docker component tests failed - skipping browser tests", tag="TEST")
        return

    # Browser tests, in execution order.
    browser_tests = (
        test_docker_connect_mode,
        test_docker_launch_mode,
        test_docker_persistent_storage,
        test_docker_parallel_pages,
        test_docker_registry_reuse,
    )
    # Awaiting inside the comprehension keeps the runs strictly sequential.
    outcomes = [await browser_test() for browser_test in browser_tests]

    # Summary
    total = len(outcomes)
    passed = len([outcome for outcome in outcomes if outcome])
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed != total:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
    else:
        logger.success("All tests passed!", tag="SUMMARY")
|
|
|
|
async def check_docker_available() -> bool:
    """Check if Docker is available on the system.

    Runs ``docker --version`` as a subprocess and treats Docker as available
    when the command exits with status 0 and prints something on stdout.

    Returns:
        bool: True if Docker is available, False otherwise (including when
        the ``docker`` executable cannot be started at all).
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            "docker", "--version",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, _ = await proc.communicate()
    except Exception:
        # Typically FileNotFoundError when the docker binary is not on PATH.
        # Only `Exception` is caught so that task cancellation and
        # KeyboardInterrupt still propagate.
        return False

    # Coerce to a real bool: `returncode == 0 and stdout` would otherwise
    # hand the raw stdout bytes back to the caller.
    return proc.returncode == 0 and bool(stdout)
|
|
|
|
# Script entry point: when this file is executed directly, run the full
# test sequence on a fresh asyncio event loop.
if __name__ == "__main__":
    asyncio.run(run_tests())