
Moves common browser functionality into the BaseBrowserStrategy class to reduce code duplication and improve maintainability. Key changes: (1) adds shared browser argument building and session management to the base class; (2) standardizes storage state handling across strategies; (3) improves process cleanup and error handling; (4) consolidates CDP URL management and container lifecycle. BREAKING CHANGE: changes browser_mode="custom" to "cdp" for consistency.
317 lines
11 KiB
Python
317 lines
11 KiB
Python
"""Test examples for PlaywrightBrowserStrategy.
|
|
|
|
These examples demonstrate the functionality of PlaywrightBrowserStrategy
|
|
and serve as functional tests.
|
|
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
# Add the project root to Python path if running directly
|
|
if __name__ == "__main__":
|
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
|
|
|
from crawl4ai.browser import BrowserManager
|
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
|
from crawl4ai.async_logger import AsyncLogger
|
|
|
|
# Create a logger for clear terminal output
|
|
logger = AsyncLogger(verbose=True, log_file=None)
|
|
|
|
|
|
|
|
async def test_start_close():
    """Start and close the browser four times in a row, loading a page each cycle.

    Each cycle starts the manager, gets a page, navigates to example.com,
    logs the page title, closes the manager, and then sleeps for one second
    before the next start.

    Returns:
        True when every cycle completes, False if any step raises.
    """
    # Create browser config for standard Playwright
    browser_config = BrowserConfig(
        headless=True,
        viewport_width=1280,
        viewport_height=800,
    )

    # Create browser manager with the config
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        for _ in range(4):
            # Start the browser
            await manager.start()
            logger.info("Browser started successfully", tag="TEST")

            # Get a page (the context returned alongside it is not needed here)
            page, _ = await manager.get_page(CrawlerRunConfig())
            logger.info("Got page successfully", tag="TEST")

            # Navigate to a website
            await page.goto("https://example.com")
            logger.info("Navigated to example.com", tag="TEST")

            # Get page title
            title = await page.title()
            logger.info(f"Page title: {title}", tag="TEST")

            # Clean up
            await manager.close()
            logger.info("Browser closed successfully", tag="TEST")

            await asyncio.sleep(1)  # Wait for a moment before restarting
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup. Catch Exception rather than using a bare
        # except so task cancellation and KeyboardInterrupt still propagate,
        # and report the cleanup failure instead of hiding it.
        try:
            await manager.close()
        except Exception as close_error:
            logger.error(f"Cleanup after failure also failed: {close_error}", tag="TEST")
        return False
    return True
|
|
|
|
async def test_playwright_basic():
    """Test basic Playwright browser functionality.

    Starts the browser, gets a page, navigates to example.com, logs the page
    title, and closes the browser.

    Returns:
        True when every step completes, False if any step raises.
    """
    logger.info("Testing standard Playwright browser", tag="TEST")

    # Create browser config for standard Playwright
    browser_config = BrowserConfig(
        headless=True,
        viewport_width=1280,
        viewport_height=800,
    )

    # Create browser manager with the config
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig(url="https://example.com")

        # Get a page (the context returned alongside it is not needed here)
        page, _ = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup. Catch Exception rather than using a bare
        # except so task cancellation and KeyboardInterrupt still propagate,
        # and report the cleanup failure instead of hiding it.
        try:
            await manager.close()
        except Exception as close_error:
            logger.error(f"Cleanup after failure also failed: {close_error}", tag="TEST")
        return False
|
|
|
|
async def test_playwright_text_mode():
    """Test Playwright browser in text-only mode.

    Starts a browser configured with ``text_mode=True``, loads example.com,
    then loads an image-heavy page and logs whether any image request was
    observed within one second. The image check is informational only: its
    outcome is logged and does not affect the return value.

    Returns:
        True when the browser steps complete, False if an unexpected
        exception is raised.
    """
    logger.info("Testing Playwright text mode", tag="TEST")

    # Create browser config with text mode enabled
    browser_config = BrowserConfig(
        headless=True,
        text_mode=True,  # Enable text-only mode
    )

    # Create browser manager with the config
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully in text mode", tag="TEST")

        # Get a page
        crawler_config = CrawlerRunConfig(url="https://example.com")
        page, _ = await manager.get_page(crawler_config)

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Check if images are blocked in text mode by watching for any image
        # request while loading a page that contains images.
        #
        # The try wraps the whole `async with`: the context manager waits for
        # the expected request when the block exits, so the timeout can be
        # raised there as well as inside the body. With the try inside the
        # block, that exit-time timeout would escape to the outer handler and
        # fail the test in exactly the case where blocking works.
        # NOTE(review): exit-time behavior is taken from Playwright's
        # documented usage of expect_request — confirm against the installed
        # version.
        has_images = False
        try:
            async with page.expect_request(
                "**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000
            ):
                # Try to load a page with images
                await page.goto("https://picsum.photos/", wait_until="domcontentloaded")
            # Leaving the block without an error means a matching request was seen.
            has_images = True
        except Exception:
            # Timeout without image requests means text mode is working
            has_images = False

        logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup. Catch Exception rather than using a bare
        # except so task cancellation and KeyboardInterrupt still propagate,
        # and report the cleanup failure instead of hiding it.
        try:
            await manager.close()
        except Exception as close_error:
            logger.error(f"Cleanup after failure also failed: {close_error}", tag="TEST")
        return False
|
|
|
|
async def test_playwright_context_reuse():
    """Test context caching and reuse with identical configurations.

    Requests pages for two crawler configs built with the same arguments and
    for a third, default config, then compares the contexts returned.

    Returns:
        True only when the first two contexts compare equal and the third
        compares unequal to the first; False otherwise or if any step raises.
    """
    logger.info("Testing context reuse with identical configurations", tag="TEST")

    # Create browser config
    browser_config = BrowserConfig(headless=True)

    # Create browser manager
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create identical crawler configs
        crawler_config1 = CrawlerRunConfig(
            css_selector="body",
        )
        crawler_config2 = CrawlerRunConfig(
            css_selector="body",
        )

        # Get pages with these configs; only the contexts are compared,
        # so the page objects are discarded.
        _, context1 = await manager.get_page(crawler_config1)
        _, context2 = await manager.get_page(crawler_config2)

        # Check if contexts are reused
        is_same_context = context1 == context2
        logger.info(f"Contexts reused: {is_same_context}", tag="TEST")

        # Now try with a different config
        crawler_config3 = CrawlerRunConfig()
        _, context3 = await manager.get_page(crawler_config3)

        # This should be a different context
        is_different_context = context1 != context3
        logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        # Both checks must hold for success
        return is_same_context and is_different_context
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup. Catch Exception rather than using a bare
        # except so task cancellation and KeyboardInterrupt still propagate,
        # and report the cleanup failure instead of hiding it.
        try:
            await manager.close()
        except Exception as close_error:
            logger.error(f"Cleanup after failure also failed: {close_error}", tag="TEST")
        return False
|
|
|
|
async def test_playwright_session_management():
    """Test session management with Playwright browser.

    Creates two sessions with distinct session IDs, stores a localStorage
    value in each, requests the first session again, kills the first session,
    and then reads the second session's value.

    Returns:
        True only when re-requesting session 1 yields the same page and
        context, and both stored values read back as written; False
        otherwise or if any step raises.
    """
    logger.info("Testing session management with Playwright browser", tag="TEST")

    browser_config = BrowserConfig(
        headless=True,
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        logger.info("Browser launched successfully", tag="TEST")

        # Create two sessions
        session1_id = "playwright_session_1"
        session2_id = "playwright_session_2"

        # Set up first session
        crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com")
        page1, context1 = await manager.get_page(crawler_config1)
        await page1.goto("https://example.com")
        await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')")
        logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")

        # Set up second session (its context is not inspected by this test)
        crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org")
        page2, _ = await manager.get_page(crawler_config2)
        await page2.goto("https://example.org")
        await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')")
        logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")

        # Get first session again
        page1_again, context1_again = await manager.get_page(crawler_config1)

        # Verify it's the same page and data persists
        is_same_page = page1 == page1_again
        is_same_context = context1 == context1_again
        data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')")
        logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")

        # Kill first session
        await manager.kill_session(session1_id)
        logger.info("Killed session 1", tag="TEST")

        # Verify second session still works
        data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')")
        logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2"
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup. Catch Exception rather than using a bare
        # except so task cancellation and KeyboardInterrupt still propagate,
        # and report the cleanup failure instead of hiding it.
        try:
            await manager.close()
        except Exception as close_error:
            logger.error(f"Cleanup after failure also failed: {close_error}", tag="TEST")
        return False
|
|
|
|
async def run_tests():
    """Run the enabled tests one after another and log a pass/fail summary.

    Only the session-management test is currently enabled; the remaining
    tests are kept below as comments so they can be switched back on.
    """
    enabled_tests = (
        # test_start_close,
        # test_playwright_basic,
        # test_playwright_text_mode,
        # test_playwright_context_reuse,
        test_playwright_session_management,
    )

    # Await each test in order so the browsers never run concurrently.
    results = [await run_case() for run_case in enabled_tests]

    # Print summary
    passed = sum(results)
    total = len(results)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed != total:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
    else:
        logger.success("All tests passed!", tag="SUMMARY")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: drive the async test runner to completion.
    asyncio.run(run_tests())
|