
Implement comprehensive network request and console message capturing:

- Add capture_network_requests and capture_console_messages config parameters
- Add network_requests and console_messages fields to the result models
- Implement Playwright event listeners to capture requests, responses, and console output
- Create detailed documentation and examples
- Add comprehensive tests

This feature enables deep visibility into web page activity for debugging, security analysis, performance profiling, and API discovery in web applications.
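A minimal usage sketch of the options described above. The parameter and field names are taken from the description; the exact API surface is an assumption, not a verified signature.

# capture_usage_sketch.py -- illustrative only, not part of the test suite below
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig


async def main():
    run_config = CrawlerRunConfig(
        capture_network_requests=True,   # assumed config parameter (from the description)
        capture_console_messages=True,   # assumed config parameter (from the description)
    )
    crawler = AsyncWebCrawler(config=BrowserConfig(headless=True))
    await crawler.start()
    try:
        result = await crawler.arun("https://example.com", config=run_config)
        # assumed result fields populated by the Playwright event listeners
        print(len(result.network_requests or []), "network events captured")
        print(len(result.console_messages or []), "console messages captured")
    finally:
        await crawler.close()


if __name__ == "__main__":
    asyncio.run(main())
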
# test_mhtml_capture.py

import pytest
import re  # For more robust MHTML checks

# Assuming these can be imported directly from the crawl4ai library
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CrawlResult

# A reliable, simple static HTML page for testing.
# Using httpbin as it's designed for testing clients.
TEST_URL_SIMPLE = "https://httpbin.org/html"
EXPECTED_CONTENT_SIMPLE = "Herman Melville - Moby-Dick"

# A slightly more complex page that requires JavaScript to render (good secondary test)
TEST_URL_JS = "https://quotes.toscrape.com/js/"
EXPECTED_CONTENT_JS = "Quotes to Scrape"  # Title of the page, which should be present in the MHTML

# The custom event_loop fixture was removed; pytest-asyncio provides a default one.

@pytest.mark.asyncio
async def test_mhtml_capture_when_enabled():
    """
    Verify that when CrawlerRunConfig has capture_mhtml=True,
    the CrawlResult contains valid MHTML content.
    """
    # Create a fresh browser config and crawler instance for this test
    browser_config = BrowserConfig(headless=True)  # Use headless mode for CI/CD runs
    # --- Key: Enable MHTML capture in the run config ---
    run_config = CrawlerRunConfig(capture_mhtml=True)

    # Create a fresh crawler instance
    crawler = AsyncWebCrawler(config=browser_config)

    try:
        # Start the browser
        await crawler.start()

        # Perform the crawl with the MHTML-enabled config
        result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)

        # --- Assertions ---
        assert result is not None, "Crawler should return a result object"
        assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"

        # 1. Check that the mhtml attribute exists (this fails if CrawlResult has not been updated)
        assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"

        # 2. Check that mhtml is populated
        assert result.mhtml is not None, "MHTML content should be captured when enabled"
        assert isinstance(result.mhtml, str), "MHTML content should be a string"
        assert len(result.mhtml) > 500, "MHTML content seems too short, likely invalid"  # Basic sanity check

        # 3. Check for MHTML structure indicators (more robust than a simple substring check).
        # MHTML files are multipart MIME messages.
        assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE), \
            "MHTML should contain 'Content-Type: multipart/related;'"
        # Should contain a boundary definition
        assert re.search(r"boundary=\"----MultipartBoundary", result.mhtml), \
            "MHTML should contain a multipart boundary"
        # Should contain the main HTML part
        assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE), \
            "MHTML should contain a 'Content-Type: text/html' part"

        # 4. Check if the *actual page content* is within the MHTML string.
        # This confirms the snapshot captured the rendered page.
        assert EXPECTED_CONTENT_SIMPLE in result.mhtml, \
            f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the captured MHTML"

        # 5. Ensure standard HTML is still present and correct
        assert result.html is not None, "Standard HTML should still be present"
        assert isinstance(result.html, str), "Standard HTML should be a string"
        assert EXPECTED_CONTENT_SIMPLE in result.html, \
            f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the standard HTML"

    finally:
        # Important: Ensure browser is completely closed even if assertions fail
        await crawler.close()
        # Help the garbage collector clean up
        crawler = None

@pytest.mark.asyncio
async def test_mhtml_capture_when_disabled_explicitly():
    """
    Verify that when CrawlerRunConfig explicitly has capture_mhtml=False,
    the CrawlResult.mhtml attribute is None.
    """
    # Create a fresh browser config and crawler instance for this test
    browser_config = BrowserConfig(headless=True)
    # --- Key: Explicitly disable MHTML capture ---
    run_config = CrawlerRunConfig(capture_mhtml=False)

    # Create a fresh crawler instance
    crawler = AsyncWebCrawler(config=browser_config)

    try:
        # Start the browser
        await crawler.start()
        result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)

        assert result is not None
        assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"

        # 1. Check attribute existence (important for TDD start)
        assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"

        # 2. Check mhtml is None
        assert result.mhtml is None, "MHTML content should be None when explicitly disabled"

        # 3. Ensure standard HTML is still present
        assert result.html is not None
        assert EXPECTED_CONTENT_SIMPLE in result.html

    finally:
        # Important: Ensure browser is completely closed even if assertions fail
        await crawler.close()
        # Help the garbage collector clean up
        crawler = None

@pytest.mark.asyncio
async def test_mhtml_capture_when_disabled_by_default():
    """
    Verify that if capture_mhtml is not specified (using its default),
    the CrawlResult.mhtml attribute is None.
    (This assumes the default value for capture_mhtml in CrawlerRunConfig is False.)
    """
    # Create a fresh browser config and crawler instance for this test
    browser_config = BrowserConfig(headless=True)
    # --- Key: Use default run config ---
    run_config = CrawlerRunConfig()  # Do not specify capture_mhtml

    # Create a fresh crawler instance
    crawler = AsyncWebCrawler(config=browser_config)

    try:
        # Start the browser
        await crawler.start()
        result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)

        assert result is not None
        assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"

        # 1. Check attribute existence
        assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"

        # 2. Check mhtml is None (assuming default is False)
        assert result.mhtml is None, "MHTML content should be None when using default config (assuming default=False)"

        # 3. Ensure standard HTML is still present
        assert result.html is not None
        assert EXPECTED_CONTENT_SIMPLE in result.html

    finally:
        # Important: Ensure browser is completely closed even if assertions fail
        await crawler.close()
        # Help the garbage collector clean up
        crawler = None

# Optional: Add a test for a JS-heavy page if needed
@pytest.mark.asyncio
async def test_mhtml_capture_on_js_page_when_enabled():
    """
    Verify MHTML capture works on a page requiring JavaScript execution.
    """
    # Create a fresh browser config and crawler instance for this test
    browser_config = BrowserConfig(headless=True)
    run_config = CrawlerRunConfig(
        capture_mhtml=True,
        # Add a small wait or JS execution if needed for the JS page to fully render.
        # For quotes.toscrape.com/js/, it renders quickly, but a wait might be safer:
        # wait_for_timeout=2000  # Example: wait up to 2 seconds
        js_code="await new Promise(r => setTimeout(r, 500));",  # Small delay after potential load
    )

    # Create a fresh crawler instance
    crawler = AsyncWebCrawler(config=browser_config)

    try:
        # Start the browser
        await crawler.start()
        result: CrawlResult = await crawler.arun(TEST_URL_JS, config=run_config)

        assert result is not None
        assert result.success is True, f"Crawling {TEST_URL_JS} should succeed. Error: {result.error_message}"
        assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
        assert result.mhtml is not None, "MHTML content should be captured on JS page when enabled"
        assert isinstance(result.mhtml, str), "MHTML content should be a string"
        assert len(result.mhtml) > 500, "MHTML content from JS page seems too short"

        # Check for MHTML structure
        assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE)
        assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE)

        # Check for content rendered by JS within the MHTML
        assert EXPECTED_CONTENT_JS in result.mhtml, \
            f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the captured MHTML"

        # Check standard HTML too
        assert result.html is not None
        assert EXPECTED_CONTENT_JS in result.html, \
            f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the standard HTML"

    finally:
        # Important: Ensure browser is completely closed even if assertions fail
        await crawler.close()
        # Help the garbage collector clean up
        crawler = None

if __name__ == "__main__":
    # Use pytest for async tests
    pytest.main(["-xvs", __file__])
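
For manual inspection outside the test suite, the captured snapshot can be written to disk and opened in a Chromium-based browser. A minimal sketch, reusing the start/arun/close pattern from the tests above (the file name and helper are arbitrary, not part of the library):

# save_mhtml_snapshot.py -- illustrative helper, not part of the test suite
import asyncio
from pathlib import Path

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig


async def save_snapshot(url: str, out_path: str = "snapshot.mhtml") -> None:
    crawler = AsyncWebCrawler(config=BrowserConfig(headless=True))
    await crawler.start()
    try:
        result = await crawler.arun(url, config=CrawlerRunConfig(capture_mhtml=True))
        if result.success and result.mhtml:
            # result.mhtml is a plain string (see the assertions above), so a text write suffices
            Path(out_path).write_text(result.mhtml, encoding="utf-8")
    finally:
        await crawler.close()


if __name__ == "__main__":
    asyncio.run(save_snapshot("https://httpbin.org/html"))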