Merge branch 'main' into next

UncleCode 2025-04-08 17:43:42 +08:00
commit 9038e9acbd
4 changed files with 1010 additions and 1 deletion

.github/workflows/main.yml (new file, 35 additions)

@@ -0,0 +1,35 @@
name: Discord GitHub Notifications
on:
issues:
types: [opened]
issue_comment:
types: [created]
pull_request:
types: [opened]
discussion:
types: [created]
jobs:
notify-discord:
runs-on: ubuntu-latest
steps:
- name: Set webhook based on event type
id: set-webhook
run: |
if [ "${{ github.event_name }}" == "discussion" ]; then
echo "webhook=${{ secrets.DISCORD_DISCUSSIONS_WEBHOOK }}" >> $GITHUB_OUTPUT
else
echo "webhook=${{ secrets.DISCORD_WEBHOOK }}" >> $GITHUB_OUTPUT
fi
- name: Discord Notification
uses: Ilshidur/action-discord@master
env:
DISCORD_WEBHOOK: ${{ steps.set-webhook.outputs.webhook }}
with:
args: |
${{ github.event_name == 'issues' && format('📣 New issue created: **{0}** by {1} - {2}', github.event.issue.title, github.event.issue.user.login, github.event.issue.html_url) ||
github.event_name == 'issue_comment' && format('💬 New comment on issue **{0}** by {1} - {2}', github.event.issue.title, github.event.comment.user.login, github.event.comment.html_url) ||
github.event_name == 'pull_request' && format('🔄 New PR opened: **{0}** by {1} - {2}', github.event.pull_request.title, github.event.pull_request.user.login, github.event.pull_request.html_url) ||
format('💬 New discussion started: **{0}** by {1} - {2}', github.event.discussion.title, github.event.discussion.user.login, github.event.discussion.html_url) }}
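
The workflow only posts a message; it does not check that the two repository secrets point at live webhooks. A minimal way to verify a webhook by hand is to POST a test payload to it; the sketch below assumes the secret holds a standard Discord webhook URL (the URL shown is a placeholder) and uses only the Python standard library:

import json
import urllib.request

# Placeholder: paste the value stored in DISCORD_WEBHOOK or DISCORD_DISCUSSIONS_WEBHOOK
WEBHOOK_URL = "https://discord.com/api/webhooks/<id>/<token>"

def send_test_message(content: str) -> int:
    # Discord webhooks accept a JSON body with a "content" field
    payload = json.dumps({"content": content}).encode("utf-8")
    req = urllib.request.Request(
        WEBHOOK_URL,
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return resp.status  # 204 means Discord accepted the message

if __name__ == "__main__":
    print(send_test_message("Test notification for the crawl4ai Discord workflow"))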


@@ -68,4 +68,4 @@ observability:
enabled: True
endpoint: "/metrics"
health_check:
endpoint: "/health"
endpoint: "/health"
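
After deploying a config with these settings, a quick sanity check is to request the two endpoints and confirm they respond. A minimal sketch, assuming the service is reachable at a base URL you supply (the localhost address below is only a placeholder):

import urllib.request

# Placeholder base URL; point this at wherever the service using this config is running
BASE_URL = "http://localhost:11235"

def check(endpoint: str) -> None:
    # The config above exposes "/metrics" and a "/health" health check
    with urllib.request.urlopen(f"{BASE_URL}{endpoint}", timeout=5) as resp:
        print(f"{endpoint}: HTTP {resp.status}")

if __name__ == "__main__":
    for endpoint in ("/health", "/metrics"):
        check(endpoint)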


@@ -0,0 +1,412 @@
import asyncio
import os
import json
import base64
from pathlib import Path
from typing import List
from crawl4ai.proxy_strategy import ProxyConfig
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai import LLMConfig
from crawl4ai import PruningContentFilter, BM25ContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain
from crawl4ai import BrowserConfig
__cur_dir__ = Path(__file__).parent
async def demo_basic_crawl():
"""Basic web crawling with markdown generation"""
print("\n=== 1. Basic Web Crawling ===")
async with AsyncWebCrawler(config = BrowserConfig(
viewport_height=800,
viewport_width=1200,
headless=True,
verbose=True,
)) as crawler:
results: List[CrawlResult] = await crawler.arun(
url="https://news.ycombinator.com/"
)
for i, result in enumerate(results):
print(f"Result {i + 1}:")
print(f"Success: {result.success}")
if result.success:
print(f"Markdown length: {len(result.markdown.raw_markdown)} chars")
print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...")
else:
print("Failed to crawl the URL")
async def demo_parallel_crawl():
"""Crawl multiple URLs in parallel"""
print("\n=== 2. Parallel Crawling ===")
urls = [
"https://news.ycombinator.com/",
"https://example.com/",
"https://httpbin.org/html",
]
async with AsyncWebCrawler() as crawler:
results: List[CrawlResult] = await crawler.arun_many(
urls=urls,
)
print(f"Crawled {len(results)} URLs in parallel:")
for i, result in enumerate(results):
print(
f" {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}"
)
async def demo_fit_markdown():
"""Generate focused markdown with LLM content filter"""
print("\n=== 3. Fit Markdown with LLM Content Filter ===")
async with AsyncWebCrawler() as crawler:
result: CrawlResult = await crawler.arun(
url = "https://en.wikipedia.org/wiki/Python_(programming_language)",
config=CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter()
)
),
)
# Print stats and save the fit markdown
print(f"Raw: {len(result.markdown.raw_markdown)} chars")
print(f"Fit: {len(result.markdown.fit_markdown)} chars")
async def demo_llm_structured_extraction_no_schema():
    """Extract structured data using an LLM (no predefined CSS schema)"""
    print("\n=== 4. LLM-Based Structured Extraction ===")
    # Create a simple LLM extraction strategy (no schema required)
extraction_strategy = LLMExtractionStrategy(
llm_config=LLMConfig(
provider="groq/qwen-2.5-32b",
api_token="env:GROQ_API_KEY",
),
instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
extract_type="schema",
schema="{title: string, url: string, comments: int}",
extra_args={
"temperature": 0.0,
"max_tokens": 4096,
},
verbose=True,
)
config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
async with AsyncWebCrawler() as crawler:
results: List[CrawlResult] = await crawler.arun(
"https://news.ycombinator.com/", config=config
)
for result in results:
print(f"URL: {result.url}")
print(f"Success: {result.success}")
if result.success:
data = json.loads(result.extracted_content)
print(json.dumps(data, indent=2))
else:
print("Failed to extract structured data")
async def demo_css_structured_extraction_no_schema():
"""Extract structured data using CSS selectors"""
print("\n=== 5. CSS-Based Structured Extraction ===")
# Sample HTML for schema generation (one-time cost)
sample_html = """
<div class="body-post clear">
<a class="story-link" href="https://thehackernews.com/2025/04/malicious-python-packages-on-pypi.html">
<div class="clear home-post-box cf">
<div class="home-img clear">
<div class="img-ratio">
<img alt="..." src="...">
</div>
</div>
<div class="clear home-right">
<h2 class="home-title">Malicious Python Packages on PyPI Downloaded 39,000+ Times, Steal Sensitive Data</h2>
<div class="item-label">
<span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span>
<span class="h-tags">Malware / Supply Chain Attack</span>
</div>
<div class="home-desc"> Cybersecurity researchers have...</div>
</div>
</div>
</a>
</div>
"""
# Check if schema file exists
schema_file_path = f"{__cur_dir__}/tmp/schema.json"
if os.path.exists(schema_file_path):
with open(schema_file_path, "r") as f:
schema = json.load(f)
else:
# Generate schema using LLM (one-time setup)
schema = JsonCssExtractionStrategy.generate_schema(
html=sample_html,
llm_config=LLMConfig(
provider="groq/qwen-2.5-32b",
api_token="env:GROQ_API_KEY",
),
query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
)
print(f"Generated schema: {json.dumps(schema, indent=2)}")
        # Save the schema to a file and reuse it for future extractions; this way the LLM only needs to be called once
with open(f"{__cur_dir__}/tmp/schema.json", "w") as f:
json.dump(schema, f, indent=2)
# Create no-LLM extraction strategy with the generated schema
extraction_strategy = JsonCssExtractionStrategy(schema)
config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
# Use the fast CSS extraction (no LLM calls during extraction)
async with AsyncWebCrawler() as crawler:
results: List[CrawlResult] = await crawler.arun(
"https://thehackernews.com", config=config
)
for result in results:
print(f"URL: {result.url}")
print(f"Success: {result.success}")
if result.success:
data = json.loads(result.extracted_content)
print(json.dumps(data, indent=2))
else:
print("Failed to extract structured data")
async def demo_deep_crawl():
"""Deep crawling with BFS strategy"""
print("\n=== 6. Deep Crawling ===")
filter_chain = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])])
deep_crawl_strategy = BFSDeepCrawlStrategy(
max_depth=1, max_pages=5, filter_chain=filter_chain
)
async with AsyncWebCrawler() as crawler:
results: List[CrawlResult] = await crawler.arun(
url="https://docs.crawl4ai.com",
config=CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy),
)
print(f"Deep crawl returned {len(results)} pages:")
for i, result in enumerate(results):
depth = result.metadata.get("depth", "unknown")
print(f" {i + 1}. {result.url} (Depth: {depth})")
async def demo_js_interaction():
"""Execute JavaScript to load more content"""
print("\n=== 7. JavaScript Interaction ===")
# A simple page that needs JS to reveal content
async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
# Initial load
news_schema = {
"name": "news",
"baseSelector": "tr.athing",
"fields": [
{
"name": "title",
"selector": "span.titleline",
"type": "text",
}
],
}
results: List[CrawlResult] = await crawler.arun(
url="https://news.ycombinator.com",
config=CrawlerRunConfig(
session_id="hn_session", # Keep session
extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
),
)
news = []
for result in results:
if result.success:
data = json.loads(result.extracted_content)
news.extend(data)
print(json.dumps(data, indent=2))
else:
print("Failed to extract structured data")
print(f"Initial items: {len(news)}")
# Click "More" link
more_config = CrawlerRunConfig(
js_code="document.querySelector('a.morelink').click();",
js_only=True, # Continue in same page
session_id="hn_session", # Keep session
extraction_strategy=JsonCssExtractionStrategy(
schema=news_schema,
),
)
        results: List[CrawlResult] = await crawler.arun(
url="https://news.ycombinator.com", config=more_config
)
# Extract new items
for result in results:
if result.success:
data = json.loads(result.extracted_content)
news.extend(data)
print(json.dumps(data, indent=2))
else:
print("Failed to extract structured data")
print(f"Total items: {len(news)}")
async def demo_media_and_links():
"""Extract media and links from a page"""
print("\n=== 8. Media and Links Extraction ===")
async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun("https://en.wikipedia.org/wiki/Main_Page")
        for i, result in enumerate(results):
# Extract and save all images
images = result.media.get("images", [])
print(f"Found {len(images)} images")
# Extract and save all links (internal and external)
internal_links = result.links.get("internal", [])
external_links = result.links.get("external", [])
print(f"Found {len(internal_links)} internal links")
print(f"Found {len(external_links)} external links")
# Print some of the images and links
for image in images[:3]:
print(f"Image: {image['src']}")
for link in internal_links[:3]:
print(f"Internal link: {link['href']}")
for link in external_links[:3]:
print(f"External link: {link['href']}")
            # Save everything to files
with open(f"{__cur_dir__}/tmp/images.json", "w") as f:
json.dump(images, f, indent=2)
with open(f"{__cur_dir__}/tmp/links.json", "w") as f:
json.dump(
{"internal": internal_links, "external": external_links},
f,
indent=2,
)
async def demo_screenshot_and_pdf():
"""Capture screenshot and PDF of a page"""
print("\n=== 9. Screenshot and PDF Capture ===")
async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
# url="https://example.com",
url="https://en.wikipedia.org/wiki/Giant_anteater",
config=CrawlerRunConfig(screenshot=True, pdf=True),
)
        for i, result in enumerate(results):
# if result.screenshot_data:
if result.screenshot:
# Save screenshot
screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png"
with open(screenshot_path, "wb") as f:
f.write(base64.b64decode(result.screenshot))
print(f"Screenshot saved to {screenshot_path}")
# if result.pdf_data:
if result.pdf:
# Save PDF
pdf_path = f"{__cur_dir__}/tmp/example.pdf"
with open(pdf_path, "wb") as f:
f.write(result.pdf)
print(f"PDF saved to {pdf_path}")
async def demo_proxy_rotation():
"""Proxy rotation for multiple requests"""
print("\n=== 10. Proxy Rotation ===")
# Example proxies (replace with real ones)
proxies = [
ProxyConfig(server="http://proxy1.example.com:8080"),
ProxyConfig(server="http://proxy2.example.com:8080"),
]
proxy_strategy = RoundRobinProxyStrategy(proxies)
print(f"Using {len(proxies)} proxies in rotation")
print(
"Note: This example uses placeholder proxies - replace with real ones to test"
)
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
proxy_rotation_strategy=proxy_strategy
)
# In a real scenario, these would be run and the proxies would rotate
print("In a real scenario, requests would rotate through the available proxies")
async def demo_raw_html_and_file():
"""Process raw HTML and local files"""
print("\n=== 11. Raw HTML and Local Files ===")
raw_html = """
<html><body>
<h1>Sample Article</h1>
<p>This is sample content for testing Crawl4AI's raw HTML processing.</p>
</body></html>
"""
# Save to file
file_path = Path("docs/examples/tmp/sample.html").absolute()
with open(file_path, "w") as f:
f.write(raw_html)
async with AsyncWebCrawler() as crawler:
# Crawl raw HTML
raw_result = await crawler.arun(
url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
)
print("Raw HTML processing:")
print(f" Markdown: {raw_result.markdown.raw_markdown[:50]}...")
# Crawl local file
file_result = await crawler.arun(
url=f"file://{file_path}",
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
)
print("\nLocal file processing:")
print(f" Markdown: {file_result.markdown.raw_markdown[:50]}...")
# Clean up
os.remove(file_path)
print(f"Processed both raw HTML and local file ({file_path})")
async def main():
"""Run all demo functions sequentially"""
print("=== Comprehensive Crawl4AI Demo ===")
print("Note: Some examples require API keys or other configurations")
# Run all demos
await demo_basic_crawl()
await demo_parallel_crawl()
await demo_fit_markdown()
await demo_llm_structured_extraction_no_schema()
await demo_css_structured_extraction_no_schema()
await demo_deep_crawl()
await demo_js_interaction()
await demo_media_and_links()
await demo_screenshot_and_pdf()
    # await demo_proxy_rotation()
await demo_raw_html_and_file()
# Clean up any temp files that may have been created
print("\n=== Demo Complete ===")
print("Check for any generated files (screenshots, PDFs) in the current directory")
if __name__ == "__main__":
asyncio.run(main())


@@ -0,0 +1,562 @@
import os, sys
from crawl4ai.types import LLMConfig
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
import asyncio
import time
import json
import re
from typing import Dict
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
print("Crawl4AI: Advanced Web Crawling and Data Extraction")
print("GitHub Repository: https://github.com/unclecode/crawl4ai")
print("Twitter: @unclecode")
print("Website: https://crawl4ai.com")
# Basic Example - Simple Crawl
async def simple_crawl():
print("\n--- Basic Usage ---")
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business", config=crawler_config
)
print(result.markdown[:500])
async def clean_content():
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
excluded_tags=["nav", "footer", "aside"],
remove_overlay_elements=True,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
),
options={"ignore_links": True},
),
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/Apple",
config=crawler_config,
)
full_markdown_length = len(result.markdown.raw_markdown)
fit_markdown_length = len(result.markdown.fit_markdown)
print(f"Full Markdown Length: {full_markdown_length}")
print(f"Fit Markdown Length: {fit_markdown_length}")
async def link_analysis():
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.ENABLED,
exclude_external_links=True,
exclude_social_media_links=True,
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=crawler_config,
)
print(f"Found {len(result.links['internal'])} internal links")
print(f"Found {len(result.links['external'])} external links")
for link in result.links["internal"][:5]:
print(f"Href: {link['href']}\nText: {link['text']}\n")
# JavaScript Execution Example
async def simple_example_with_running_js_code():
print("\n--- Executing JavaScript and Using CSS Selectors ---")
browser_config = BrowserConfig(headless=True, java_script_enabled=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();",
# wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business", config=crawler_config
)
print(result.markdown[:500])
# CSS Selector Example
async def simple_example_with_css_selector():
print("\n--- Using CSS Selectors ---")
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description"
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business", config=crawler_config
)
print(result.markdown[:500])
async def media_handling():
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business", config=crawler_config
)
for img in result.media["images"][:5]:
print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
async def custom_hook_workflow(verbose=True):
async with AsyncWebCrawler() as crawler:
# Set a 'before_goto' hook to run custom code just before navigation
crawler.crawler_strategy.set_hook(
"before_goto",
lambda page, context: print("[Hook] Preparing to navigate..."),
)
# Perform the crawl operation
result = await crawler.arun(url="https://crawl4ai.com")
print(result.markdown.raw_markdown[:500].replace("\n", " -- "))
# Proxy Example
async def use_proxy():
print("\n--- Using a Proxy ---")
browser_config = BrowserConfig(
headless=True,
proxy_config={
"server": "http://proxy.example.com:8080",
"username": "username",
"password": "password",
},
)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business", config=crawler_config
)
if result.success:
print(result.markdown[:500])
# Screenshot Example
async def capture_and_save_screenshot(url: str, output_path: str):
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url=url, config=crawler_config)
if result.success and result.screenshot:
import base64
screenshot_data = base64.b64decode(result.screenshot)
with open(output_path, "wb") as f:
f.write(screenshot_data)
print(f"Screenshot saved successfully to {output_path}")
else:
print("Failed to capture screenshot")
# LLM Extraction Example
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(
..., description="Fee for output token for the OpenAI model."
)
async def extract_structured_data_using_llm(
provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
print(f"\n--- Extracting Structured Data with {provider} ---")
if api_token is None and provider != "ollama":
print(f"API token is required for {provider}. Skipping this example.")
return
browser_config = BrowserConfig(headless=True)
extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
if extra_headers:
extra_args["extra_headers"] = extra_headers
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=1,
page_timeout=80000,
extraction_strategy=LLMExtractionStrategy(
llm_config=LLMConfig(provider=provider,api_token=api_token),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
Do not miss any models in the entire content.""",
extra_args=extra_args,
),
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://openai.com/api/pricing/", config=crawler_config
)
print(result.extracted_content)
# CSS Extraction Example
async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
schema = {
"name": "KidoCode Courses",
"baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item",
"fields": [
{
"name": "section_title",
"selector": "h3.heading-50",
"type": "text",
},
{
"name": "section_description",
"selector": ".charge-content",
"type": "text",
},
{
"name": "course_name",
"selector": ".text-block-93",
"type": "text",
},
{
"name": "course_description",
"selector": ".course-content-text",
"type": "text",
},
{
"name": "course_icon",
"selector": ".image-92",
"type": "attribute",
"attribute": "src",
},
],
}
browser_config = BrowserConfig(headless=True, java_script_enabled=True)
js_click_tabs = """
(async () => {
const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
for(let tab of tabs) {
tab.scrollIntoView();
tab.click();
await new Promise(r => setTimeout(r, 500));
}
})();
"""
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=JsonCssExtractionStrategy(schema),
js_code=[js_click_tabs],
delay_before_return_html=1
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.kidocode.com/degrees/technology", config=crawler_config
)
        courses = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(courses)} course items")
        print(json.dumps(courses[0], indent=2))
# Dynamic Content Examples - Method 1
async def crawl_dynamic_content_pages_method_1():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
first_commit = ""
async def on_execution_started(page, **kwargs):
nonlocal first_commit
try:
while True:
await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
commit = await commit.evaluate("(element) => element.textContent")
commit = re.sub(r"\s+", "", commit)
if commit and commit != first_commit:
first_commit = commit
break
await asyncio.sleep(0.5)
except Exception as e:
print(f"Warning: New content didn't appear after JavaScript execution: {e}")
browser_config = BrowserConfig(headless=False, java_script_enabled=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
url = "https://github.com/microsoft/TypeScript/commits/main"
session_id = "typescript_commits_session"
all_commits = []
js_next_page = """
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
"""
for page in range(3):
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
css_selector="li.Box-sc-g0xbh4-0",
js_code=js_next_page if page > 0 else None,
js_only=page > 0,
session_id=session_id,
)
result = await crawler.arun(url=url, config=crawler_config)
assert result.success, f"Failed to crawl page {page + 1}"
soup = BeautifulSoup(result.cleaned_html, "html.parser")
commits = soup.select("li")
all_commits.extend(commits)
print(f"Page {page + 1}: Found {len(commits)} commits")
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
# Dynamic Content Examples - Method 2
async def crawl_dynamic_content_pages_method_2():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
browser_config = BrowserConfig(headless=False, java_script_enabled=True)
js_next_page_and_wait = """
(async () => {
const getCurrentCommit = () => {
const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
return commits.length > 0 ? commits[0].textContent.trim() : null;
};
const initialCommit = getCurrentCommit();
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
while (true) {
await new Promise(resolve => setTimeout(resolve, 100));
const newCommit = getCurrentCommit();
if (newCommit && newCommit !== initialCommit) {
break;
}
}
})();
"""
schema = {
"name": "Commit Extractor",
"baseSelector": "li.Box-sc-g0xbh4-0",
"fields": [
{
"name": "title",
"selector": "h4.markdown-title",
"type": "text",
"transform": "strip",
},
],
}
async with AsyncWebCrawler(config=browser_config) as crawler:
url = "https://github.com/microsoft/TypeScript/commits/main"
session_id = "typescript_commits_session"
all_commits = []
extraction_strategy = JsonCssExtractionStrategy(schema)
for page in range(3):
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
css_selector="li.Box-sc-g0xbh4-0",
extraction_strategy=extraction_strategy,
js_code=js_next_page_and_wait if page > 0 else None,
js_only=page > 0,
session_id=session_id,
)
result = await crawler.arun(url=url, config=crawler_config)
assert result.success, f"Failed to crawl page {page + 1}"
commits = json.loads(result.extracted_content)
all_commits.extend(commits)
print(f"Page {page + 1}: Found {len(commits)} commits")
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
async def cosine_similarity_extraction():
from crawl4ai.extraction_strategy import CosineStrategy
crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy(
word_count_threshold=10,
max_dist=0.2, # Maximum distance between two words
linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
top_k=3, # Number of top keywords to extract
sim_threshold=0.3, # Similarity threshold for clustering
semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
verbose=True,
),
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
config=crawl_config,
)
print(json.loads(result.extracted_content)[:5])
# Browser Comparison
async def crawl_custom_browser_type():
print("\n--- Browser Comparison ---")
# Firefox
browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True)
start = time.time()
async with AsyncWebCrawler(config=browser_config_firefox) as crawler:
result = await crawler.arun(
url="https://www.example.com",
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
)
print("Firefox:", time.time() - start)
print(result.markdown[:500])
# WebKit
browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True)
start = time.time()
async with AsyncWebCrawler(config=browser_config_webkit) as crawler:
result = await crawler.arun(
url="https://www.example.com",
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
)
print("WebKit:", time.time() - start)
print(result.markdown[:500])
# Chromium (default)
browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True)
start = time.time()
async with AsyncWebCrawler(config=browser_config_chromium) as crawler:
result = await crawler.arun(
url="https://www.example.com",
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
)
print("Chromium:", time.time() - start)
print(result.markdown[:500])
# Anti-Bot and User Simulation
async def crawl_with_user_simulation():
browser_config = BrowserConfig(
headless=True,
user_agent_mode="random",
user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
magic=True,
simulate_user=True,
override_navigator=True,
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
print(result.markdown)
async def ssl_certification():
# Configure crawler to fetch SSL certificate
config = CrawlerRunConfig(
fetch_ssl_certificate=True,
cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com", config=config)
if result.success and result.ssl_certificate:
cert = result.ssl_certificate
tmp_dir = os.path.join(__location__, "tmp")
os.makedirs(tmp_dir, exist_ok=True)
# 1. Access certificate properties directly
print("\nCertificate Information:")
print(f"Issuer: {cert.issuer.get('CN', '')}")
print(f"Valid until: {cert.valid_until}")
print(f"Fingerprint: {cert.fingerprint}")
# 2. Export certificate in different formats
cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
print("\nCertificate exported to:")
print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
pem_data = cert.to_pem(
os.path.join(tmp_dir, "certificate.pem")
) # For web servers
print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
der_data = cert.to_der(
os.path.join(tmp_dir, "certificate.der")
) # For Java apps
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
# Main execution
async def main():
# Basic examples
await simple_crawl()
await simple_example_with_running_js_code()
await simple_example_with_css_selector()
# Advanced examples
await extract_structured_data_using_css_extractor()
await extract_structured_data_using_llm(
"openai/gpt-4o", os.getenv("OPENAI_API_KEY")
)
await crawl_dynamic_content_pages_method_1()
await crawl_dynamic_content_pages_method_2()
# Browser comparisons
await crawl_custom_browser_type()
# Screenshot example
await capture_and_save_screenshot(
"https://www.example.com",
os.path.join(__location__, "tmp/example_screenshot.jpg")
)
if __name__ == "__main__":
asyncio.run(main())