# test_mhtml_capture.py
import pytest
import asyncio
import re  # For more robust MHTML checks

# Assuming these can be imported directly from the crawl4ai library
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CrawlResult

# A reliable, simple static HTML page for testing
# Using httpbin as it's designed for testing clients
TEST_URL_SIMPLE = "https://httpbin.org/html"
EXPECTED_CONTENT_SIMPLE = "Herman Melville - Moby-Dick"

# A slightly more complex page that might involve JS (good secondary test)
TEST_URL_JS = "https://quotes.toscrape.com/js/"
EXPECTED_CONTENT_JS = "Quotes to Scrape"  # Title of the page, which should be present in MHTML

# Removed the custom event_loop fixture as pytest-asyncio provides a default one.


@pytest.mark.asyncio
async def test_mhtml_capture_when_enabled():
    """
    Verify that when CrawlerRunConfig has capture_mhtml=True,
    the CrawlResult contains valid MHTML content.
    """
    # Create a fresh browser config and crawler instance for this test
    browser_config = BrowserConfig(headless=True)  # Use headless for CI/CD testing
    # --- Key: Enable MHTML capture in the run config ---
    run_config = CrawlerRunConfig(capture_mhtml=True)

    # Create a fresh crawler instance
    crawler = AsyncWebCrawler(config=browser_config)

    try:
        # Start the browser
        await crawler.start()

        # Perform the crawl with the MHTML-enabled config
        result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)

        # --- Assertions ---
        assert result is not None, "Crawler should return a result object"
        assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"

        # 1. Check if the mhtml attribute exists (will fail if CrawlResult not updated)
        assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"

        # 2. Check if mhtml is populated
        assert result.mhtml is not None, "MHTML content should be captured when enabled"
        assert isinstance(result.mhtml, str), "MHTML content should be a string"
        assert len(result.mhtml) > 500, "MHTML content seems too short, likely invalid"  # Basic sanity check

        # 3. Check for MHTML structure indicators (more robust than simple string contains)
        # MHTML files are multipart MIME messages
        assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE), \
            "MHTML should contain 'Content-Type: multipart/related;'"
        # Should contain a boundary definition
        assert re.search(r"boundary=\"----MultipartBoundary", result.mhtml), \
            "MHTML should contain a multipart boundary"
        # Should contain the main HTML part
        assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE), \
            "MHTML should contain a 'Content-Type: text/html' part"

        # 4. Check if the *actual page content* is within the MHTML string
        # This confirms the snapshot captured the rendered page
        assert EXPECTED_CONTENT_SIMPLE in result.mhtml, \
            f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the captured MHTML"

        # 5. Ensure standard HTML is still present and correct
        assert result.html is not None, "Standard HTML should still be present"
        assert isinstance(result.html, str), "Standard HTML should be a string"
        assert EXPECTED_CONTENT_SIMPLE in result.html, \
            f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the standard HTML"

    finally:
        # Important: Ensure browser is completely closed even if assertions fail
        await crawler.close()
        # Help the garbage collector clean up
        crawler = None


@pytest.mark.asyncio
async def test_mhtml_capture_when_disabled_explicitly():
    """
    Verify that when CrawlerRunConfig explicitly has capture_mhtml=False,
    the CrawlResult.mhtml attribute is None.
    """
    # Create a fresh browser config and crawler instance for this test
    browser_config = BrowserConfig(headless=True)
    # --- Key: Explicitly disable MHTML capture ---
    run_config = CrawlerRunConfig(capture_mhtml=False)

    # Create a fresh crawler instance
    crawler = AsyncWebCrawler(config=browser_config)

    try:
        # Start the browser
        await crawler.start()
        result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)

        assert result is not None
        assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"

        # 1. Check attribute existence (important for TDD start)
        assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"

        # 2. Check mhtml is None
        assert result.mhtml is None, "MHTML content should be None when explicitly disabled"

        # 3. Ensure standard HTML is still present
        assert result.html is not None
        assert EXPECTED_CONTENT_SIMPLE in result.html

    finally:
        # Important: Ensure browser is completely closed even if assertions fail
        await crawler.close()
        # Help the garbage collector clean up
        crawler = None


@pytest.mark.asyncio
async def test_mhtml_capture_when_disabled_by_default():
    """
    Verify that if capture_mhtml is not specified (using its default),
    the CrawlResult.mhtml attribute is None.
    (This assumes the default value for capture_mhtml in CrawlerRunConfig is False)
    """
    # Create a fresh browser config and crawler instance for this test
    browser_config = BrowserConfig(headless=True)
    # --- Key: Use default run config ---
    run_config = CrawlerRunConfig()  # Do not specify capture_mhtml

    # Create a fresh crawler instance
    crawler = AsyncWebCrawler(config=browser_config)

    try:
        # Start the browser
        await crawler.start()
        result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)

        assert result is not None
        assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"

        # 1. Check attribute existence
        assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"

        # 2. Check mhtml is None (assuming default is False)
        assert result.mhtml is None, "MHTML content should be None when using default config (assuming default=False)"

        # 3. Ensure standard HTML is still present
        assert result.html is not None
        assert EXPECTED_CONTENT_SIMPLE in result.html

    finally:
        # Important: Ensure browser is completely closed even if assertions fail
        await crawler.close()
        # Help the garbage collector clean up
        crawler = None


# Optional: Add a test for a JS-heavy page if needed
@pytest.mark.asyncio
async def test_mhtml_capture_on_js_page_when_enabled():
    """
    Verify MHTML capture works on a page requiring JavaScript execution.
    """
    # Create a fresh browser config and crawler instance for this test
    browser_config = BrowserConfig(headless=True)
    run_config = CrawlerRunConfig(
        capture_mhtml=True,
        # Add a small wait or JS execution if needed for the JS page to fully render
        # For quotes.toscrape.com/js/, it renders quickly, but a wait might be safer
        # wait_for_timeout=2000  # Example: wait up to 2 seconds
        js_code="await new Promise(r => setTimeout(r, 500));"  # Small delay after potential load
    )

    # Create a fresh crawler instance
    crawler = AsyncWebCrawler(config=browser_config)

    try:
        # Start the browser
        await crawler.start()
        result: CrawlResult = await crawler.arun(TEST_URL_JS, config=run_config)

        assert result is not None
        assert result.success is True, f"Crawling {TEST_URL_JS} should succeed. Error: {result.error_message}"
        assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
        assert result.mhtml is not None, "MHTML content should be captured on JS page when enabled"
        assert isinstance(result.mhtml, str), "MHTML content should be a string"
        assert len(result.mhtml) > 500, "MHTML content from JS page seems too short"

        # Check for MHTML structure
        assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE)
        assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE)

        # Check for content rendered by JS within the MHTML
        assert EXPECTED_CONTENT_JS in result.mhtml, \
            f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the captured MHTML"

        # Check standard HTML too
        assert result.html is not None
        assert EXPECTED_CONTENT_JS in result.html, \
            f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the standard HTML"

    finally:
        # Important: Ensure browser is completely closed even if assertions fail
        await crawler.close()
        # Help the garbage collector clean up
        crawler = None


if __name__ == "__main__":
    # Use pytest for async tests
    pytest.main(["-xvs", __file__])