crawl4ai/docs/examples/markdown/content_source_example.py

"""
Example showing how to use the content_source parameter to control HTML input for markdown generation.
"""
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator

async def demo_content_source():
    """Demonstrates different content_source options for markdown generation."""
    url = "https://example.com"  # Simple demo site

    print("Crawling with different content_source options...")

    # --- Example 1: Default Behavior (cleaned_html) ---
    # This uses the HTML after it has been processed by the scraping strategy
    # The HTML is cleaned, simplified, and optimized for readability
    default_generator = DefaultMarkdownGenerator()  # content_source="cleaned_html" is default
    default_config = CrawlerRunConfig(markdown_generator=default_generator)

    # --- Example 2: Raw HTML ---
    # This uses the original HTML directly from the webpage
    # Preserves more original content but may include navigation, ads, etc.
    raw_generator = DefaultMarkdownGenerator(content_source="raw_html")
    raw_config = CrawlerRunConfig(markdown_generator=raw_generator)

    # --- Example 3: Fit HTML ---
    # This uses preprocessed HTML optimized for schema extraction
    # Better for structured data extraction but may lose some formatting
    fit_generator = DefaultMarkdownGenerator(content_source="fit_html")
    fit_config = CrawlerRunConfig(markdown_generator=fit_generator)

    # Execute all three crawlers in sequence
    async with AsyncWebCrawler() as crawler:
        # Default (cleaned_html)
        result_default = await crawler.arun(url=url, config=default_config)

        # Raw HTML
        result_raw = await crawler.arun(url=url, config=raw_config)

        # Fit HTML
        result_fit = await crawler.arun(url=url, config=fit_config)

    # Print a summary of the results
    print("\nMarkdown Generation Results:\n")

    print("1. Default (cleaned_html):")
    print(f"   Length: {len(result_default.markdown.raw_markdown)} chars")
    print(f"   First 80 chars: {result_default.markdown.raw_markdown[:80]}...\n")

    print("2. Raw HTML:")
    print(f"   Length: {len(result_raw.markdown.raw_markdown)} chars")
    print(f"   First 80 chars: {result_raw.markdown.raw_markdown[:80]}...\n")

    print("3. Fit HTML:")
    print(f"   Length: {len(result_fit.markdown.raw_markdown)} chars")
    print(f"   First 80 chars: {result_fit.markdown.raw_markdown[:80]}...\n")

    # Demonstrate differences in output
    print("\nKey Takeaways:")
    print("- cleaned_html: Best for readable, focused content")
    print("- raw_html: Preserves more original content, but may include noise")
    print("- fit_html: Optimized for schema extraction and structured data")

if __name__ == "__main__":
    asyncio.run(demo_content_source())