# test_mhtml_capture.py
import pytest
import re # For more robust MHTML checks
# Assuming these can be imported directly from the crawl4ai library
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CrawlResult
# A reliable, simple static HTML page for testing
# Using httpbin as it's designed for testing clients
TEST_URL_SIMPLE = "https://httpbin.org/html"
EXPECTED_CONTENT_SIMPLE = "Herman Melville - Moby-Dick"
# A slightly more complex page that might involve JS (good secondary test)
TEST_URL_JS = "https://quotes.toscrape.com/js/"
EXPECTED_CONTENT_JS = "Quotes to Scrape" # Title of the page, which should be present in MHTML
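# For reference, a Chromium MHTML snapshot typically starts like this (the exact
# headers and boundary token vary by browser build; this sketch is illustrative,
# not a crawl4ai guarantee):
#
#   From: <Saved by Blink>
#   Snapshot-Content-Location: https://httpbin.org/html
#   MIME-Version: 1.0
#   Content-Type: multipart/related;
#       type="text/html";
#       boundary="----MultipartBoundary--<random token>----"
#
# The structural assertions below key off these headers.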
# pytest-asyncio supplies the event loop for these tests; no custom event_loop fixture is needed.
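# If pytest reports "async def functions are not natively supported", install
# pytest-asyncio; the explicit @pytest.mark.asyncio marks below work under its
# default "strict" mode. To drop the per-test marks, auto mode can be enabled
# instead (a config sketch, assuming pytest-asyncio >= 0.21):
#
#   [pytest]
#   asyncio_mode = auto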
@pytest.mark.asyncio
async def test_mhtml_capture_when_enabled():
"""
Verify that when CrawlerRunConfig has capture_mhtml=True,
the CrawlResult contains valid MHTML content.
"""
# Create a fresh browser config and crawler instance for this test
    browser_config = BrowserConfig(headless=True)  # Headless so the test can run in CI
# --- Key: Enable MHTML capture in the run config ---
run_config = CrawlerRunConfig(capture_mhtml=True)
# Create a fresh crawler instance
crawler = AsyncWebCrawler(config=browser_config)
try:
# Start the browser
await crawler.start()
# Perform the crawl with the MHTML-enabled config
result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
# --- Assertions ---
assert result is not None, "Crawler should return a result object"
assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
        # 1. Check that the mhtml attribute exists (this will fail if CrawlResult has not been updated)
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
# 2. Check if mhtml is populated
assert result.mhtml is not None, "MHTML content should be captured when enabled"
assert isinstance(result.mhtml, str), "MHTML content should be a string"
assert len(result.mhtml) > 500, "MHTML content seems too short, likely invalid" # Basic sanity check
# 3. Check for MHTML structure indicators (more robust than simple string contains)
# MHTML files are multipart MIME messages
assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE), \
"MHTML should contain 'Content-Type: multipart/related;'"
# Should contain a boundary definition
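        # (the "----MultipartBoundary" prefix matched below is what Chromium's
        # MHTML serializer emits; other engines may use a different token)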
assert re.search(r"boundary=\"----MultipartBoundary", result.mhtml), \
"MHTML should contain a multipart boundary"
# Should contain the main HTML part
assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE), \
"MHTML should contain a 'Content-Type: text/html' part"
# 4. Check if the *actual page content* is within the MHTML string
# This confirms the snapshot captured the rendered page
assert EXPECTED_CONTENT_SIMPLE in result.mhtml, \
f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the captured MHTML"
# 5. Ensure standard HTML is still present and correct
assert result.html is not None, "Standard HTML should still be present"
assert isinstance(result.html, str), "Standard HTML should be a string"
assert EXPECTED_CONTENT_SIMPLE in result.html, \
f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the standard HTML"
finally:
# Important: Ensure browser is completely closed even if assertions fail
await crawler.close()
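# Optional, stricter structural check: MHTML is plain MIME, so the standard
# library's email parser can split a snapshot into its parts. A minimal sketch,
# assuming the snapshot parses as a well-formed multipart message:
def count_mhtml_parts(mhtml: str) -> int:
    """Return the number of MIME parts in an MHTML snapshot (0 if not multipart)."""
    import email
    from email import policy
    msg = email.message_from_string(mhtml, policy=policy.default)
    if not msg.is_multipart():
        return 0
    return sum(1 for _ in msg.iter_parts())
# Usage from any of the tests above, e.g.:
#   assert count_mhtml_parts(result.mhtml) >= 1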
@pytest.mark.asyncio
async def test_mhtml_capture_when_disabled_explicitly():
"""
Verify that when CrawlerRunConfig explicitly has capture_mhtml=False,
the CrawlResult.mhtml attribute is None.
"""
# Create a fresh browser config and crawler instance for this test
browser_config = BrowserConfig(headless=True)
# --- Key: Explicitly disable MHTML capture ---
run_config = CrawlerRunConfig(capture_mhtml=False)
# Create a fresh crawler instance
crawler = AsyncWebCrawler(config=browser_config)
try:
# Start the browser
await crawler.start()
result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
assert result is not None
assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
        # 1. Check attribute existence (the first check to fail in a test-driven workflow)
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
# 2. Check mhtml is None
assert result.mhtml is None, "MHTML content should be None when explicitly disabled"
# 3. Ensure standard HTML is still present
assert result.html is not None
assert EXPECTED_CONTENT_SIMPLE in result.html
finally:
# Important: Ensure browser is completely closed even if assertions fail
await crawler.close()
@pytest.mark.asyncio
async def test_mhtml_capture_when_disabled_by_default():
"""
Verify that if capture_mhtml is not specified (using its default),
the CrawlResult.mhtml attribute is None.
(This assumes the default value for capture_mhtml in CrawlerRunConfig is False)
"""
# Create a fresh browser config and crawler instance for this test
browser_config = BrowserConfig(headless=True)
# --- Key: Use default run config ---
run_config = CrawlerRunConfig() # Do not specify capture_mhtml
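    # Sanity-check the assumed default directly on the config object before
    # crawling, so a changed default fails fast (assumes capture_mhtml is a
    # readable attribute on CrawlerRunConfig)
    assert run_config.capture_mhtml is False, \
        "Expected CrawlerRunConfig.capture_mhtml to default to False"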
# Create a fresh crawler instance
crawler = AsyncWebCrawler(config=browser_config)
try:
# Start the browser
await crawler.start()
result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
assert result is not None
assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
# 1. Check attribute existence
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
# 2. Check mhtml is None (assuming default is False)
assert result.mhtml is None, "MHTML content should be None when using default config (assuming default=False)"
# 3. Ensure standard HTML is still present
assert result.html is not None
assert EXPECTED_CONTENT_SIMPLE in result.html
finally:
# Important: Ensure browser is completely closed even if assertions fail
await crawler.close()
# Optional: Add a test for a JS-heavy page if needed
@pytest.mark.asyncio
async def test_mhtml_capture_on_js_page_when_enabled():
"""
Verify MHTML capture works on a page requiring JavaScript execution.
"""
# Create a fresh browser config and crawler instance for this test
browser_config = BrowserConfig(headless=True)
    run_config = CrawlerRunConfig(
        capture_mhtml=True,
        # quotes.toscrape.com/js/ renders quickly, but a short pause after load
        # makes the snapshot less flaky. Alternatively, a wait_for condition
        # (e.g. wait_for="css:.quote") could gate on the JS-rendered content.
        js_code="await new Promise(r => setTimeout(r, 500));",  # small post-load delay
    )
# Create a fresh crawler instance
crawler = AsyncWebCrawler(config=browser_config)
try:
# Start the browser
await crawler.start()
result: CrawlResult = await crawler.arun(TEST_URL_JS, config=run_config)
assert result is not None
assert result.success is True, f"Crawling {TEST_URL_JS} should succeed. Error: {result.error_message}"
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
assert result.mhtml is not None, "MHTML content should be captured on JS page when enabled"
assert isinstance(result.mhtml, str), "MHTML content should be a string"
assert len(result.mhtml) > 500, "MHTML content from JS page seems too short"
# Check for MHTML structure
assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE)
assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE)
# Check for content rendered by JS within the MHTML
assert EXPECTED_CONTENT_JS in result.mhtml, \
f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the captured MHTML"
# Check standard HTML too
assert result.html is not None
assert EXPECTED_CONTENT_JS in result.html, \
f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the standard HTML"
finally:
# Important: Ensure browser is completely closed even if assertions fail
await crawler.close()
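# Not a test: a captured snapshot can be written to disk and opened in a
# Chromium-based browser for manual inspection (file name and encoding are
# the author's assumptions, not crawl4ai conventions):
#
#   from pathlib import Path
#   Path("snapshot.mhtml").write_text(result.mhtml, encoding="utf-8")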
if __name__ == "__main__":
# Use pytest for async tests
pytest.main(["-xvs", __file__])