# crawl4ai/docs/examples/llm_markdown_generator.py

import os
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai import LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter

async def test_llm_filter():
    """Crawl a documentation page and filter it down to markdown with an LLM.

    Steps:
      1. Crawl the Python tutorial page on classes.
      2. Pass the crawler's cleaned HTML through ``LLMContentFilter``.
      3. Print the number of filtered chunks and a 500-character preview.
      4. Write the chunks, newline-joined, to ``filtered_content.md`` in the
         current working directory.
      5. Report the filter's token usage.

    The API token is read from the ``OPENAI_API_KEY`` environment variable.
    If the crawl yields no cleaned HTML, the function prints a notice and
    returns without calling the LLM or writing any file.
    """
    # Page whose content needs intelligent filtering
    url = "https://docs.python.org/3/tutorial/classes.html"

    browser_config = BrowserConfig(
        headless=True,
        verbose=True
    )

    # CacheMode.BYPASS can be substituted here (presumably to skip the crawl
    # cache and re-fetch the page -- confirm against the crawl4ai docs).
    run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Fetch the page and take the crawler's cleaned HTML as filter input
        result = await crawler.arun(url, config=run_config)
        html = result.cleaned_html

        # Nothing to filter: avoid producing an empty markdown file.
        if not html:
            print(f"\nNo cleaned HTML returned for {url}; nothing to filter.")
            return

        # Named `content_filter` so the builtin `filter` is not shadowed.
        content_filter = LLMContentFilter(
            llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
            chunk_token_threshold=2 ** 12 * 2,  # 4096 * 2 = 8192
            ignore_cache=True,
            instruction="""
            Extract the main educational content while preserving its original wording and substance completely. Your task is to:

            1. Maintain the exact language and terminology used in the main content
            2. Keep all technical explanations, examples, and educational content intact
            3. Preserve the original flow and structure of the core content
            4. Remove only clearly irrelevant elements like:
            - Navigation menus
            - Advertisement sections
            - Cookie notices
            - Footers with site information
            - Sidebars with external links
            - Any UI elements that don't contribute to learning

            The goal is to create a clean markdown version that reads exactly like the original article,
            keeping all valuable content but free from distracting elements. Imagine you're creating
            a perfect reading experience where nothing valuable is lost, but all noise is removed.
            """,
            verbose=True
        )

        # Apply filtering; the result is used below as a sequence of strings
        filtered_content = content_filter.filter_content(html)

        # Show results (the length printed is the number of chunks returned)
        print("\nFiltered Content Length:", len(filtered_content))
        print("\nFirst 500 chars of filtered content:")
        if filtered_content:
            print(filtered_content[0][:500])

        # Save the markdown version to disk
        with open("filtered_content.md", "w", encoding="utf-8") as f:
            f.write("\n".join(filtered_content))

        # Show token usage
        content_filter.show_usage()

if __name__ == "__main__":
    # Script entry point: build the example coroutine, then drive it to
    # completion on a fresh event loop.
    example_run = test_llm_filter()
    asyncio.run(example_run)