# crawl4ai/docs/examples/quickstart_examples_set_1.py
import asyncio
import os
import json
import base64
from pathlib import Path
from typing import List

from crawl4ai.proxy_strategy import ProxyConfig
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai import LLMConfig
from crawl4ai import PruningContentFilter, BM25ContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain
from crawl4ai import BrowserConfig

__cur_dir__ = Path(__file__).parent
# Several demos below write their output into this directory; create it up
# front so the file writes cannot fail on a missing path.
(__cur_dir__ / "tmp").mkdir(parents=True, exist_ok=True)


async def demo_basic_crawl():
    """Basic web crawling with markdown generation"""
    print("\n=== 1. Basic Web Crawling ===")
    async with AsyncWebCrawler(
        config=BrowserConfig(
            viewport_height=800,
            viewport_width=1200,
            headless=True,
            verbose=True,
        )
    ) as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com/"
        )

        for i, result in enumerate(results):
            print(f"Result {i + 1}:")
            print(f"Success: {result.success}")
            if result.success:
                print(f"Markdown length: {len(result.markdown.raw_markdown)} chars")
                print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...")
            else:
                print("Failed to crawl the URL")


async def demo_parallel_crawl():
    """Crawl multiple URLs in parallel"""
    print("\n=== 2. Parallel Crawling ===")
    urls = [
        "https://news.ycombinator.com/",
        "https://example.com/",
        "https://httpbin.org/html",
    ]
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun_many(
            urls=urls,
        )

        print(f"Crawled {len(results)} URLs in parallel:")
        for i, result in enumerate(results):
            print(
                f" {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}"
            )


async def demo_fit_markdown():
    """Generate focused markdown with a pruning content filter"""
    print("\n=== 3. Fit Markdown with Content Filter ===")
    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Python_(programming_language)",
            config=CrawlerRunConfig(
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter()
                )
            ),
        )

        # Print stats for the raw and fit markdown
        print(f"Raw: {len(result.markdown.raw_markdown)} chars")
        print(f"Fit: {len(result.markdown.fit_markdown)} chars")


async def demo_llm_structured_extraction_no_schema():
    """Extract structured data with an LLM, without a pre-generated schema file"""
    print("\n=== 4. LLM-Based Structured Extraction ===")
    # Create a simple LLM extraction strategy (no schema required)
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="groq/qwen-2.5-32b",
            api_token="env:GROQ_API_KEY",
        ),
        instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
        extract_type="schema",
        schema="{title: string, url: string, comments: int}",
        extra_args={
            "temperature": 0.0,
            "max_tokens": 4096,
        },
        verbose=True,
    )
    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://news.ycombinator.com/", config=config
        )

        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")


async def demo_css_structured_extraction_no_schema():
    """Extract structured data using CSS selectors"""
    print("\n=== 5. CSS-Based Structured Extraction ===")
    # Sample HTML for schema generation (one-time cost)
    sample_html = """
    <div class="body-post clear">
        <a class="story-link" href="https://thehackernews.com/2025/04/malicious-python-packages-on-pypi.html">
            <div class="clear home-post-box cf">
                <div class="home-img clear">
                    <div class="img-ratio">
                        <img alt="..." src="...">
                    </div>
                </div>
                <div class="clear home-right">
                    <h2 class="home-title">Malicious Python Packages on PyPI Downloaded 39,000+ Times, Steal Sensitive Data</h2>
                    <div class="item-label">
                        <span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span>
                        <span class="h-tags">Malware / Supply Chain Attack</span>
                    </div>
                    <div class="home-desc"> Cybersecurity researchers have...</div>
                </div>
            </div>
        </a>
    </div>
    """

    # Check if the schema file already exists
    schema_file_path = f"{__cur_dir__}/tmp/schema.json"
    if os.path.exists(schema_file_path):
        with open(schema_file_path, "r") as f:
            schema = json.load(f)
    else:
        # Generate schema using LLM (one-time setup)
        schema = JsonCssExtractionStrategy.generate_schema(
            html=sample_html,
            llm_config=LLMConfig(
                provider="groq/qwen-2.5-32b",
                api_token="env:GROQ_API_KEY",
            ),
            query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
        )

        print(f"Generated schema: {json.dumps(schema, indent=2)}")
        # Save the schema to a file and reuse it for future extractions,
        # so the LLM only needs to be called once.
        with open(schema_file_path, "w") as f:
            json.dump(schema, f, indent=2)

    # Create a no-LLM extraction strategy with the generated schema
    extraction_strategy = JsonCssExtractionStrategy(schema)
    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)

    # Use the fast CSS extraction (no LLM calls during extraction)
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://thehackernews.com", config=config
        )

        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")


async def demo_deep_crawl():
    """Deep crawling with BFS strategy"""
    print("\n=== 6. Deep Crawling ===")
    filter_chain = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])])

    deep_crawl_strategy = BFSDeepCrawlStrategy(
        max_depth=1, max_pages=5, filter_chain=filter_chain
    )

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy),
        )

        print(f"Deep crawl returned {len(results)} pages:")
        for i, result in enumerate(results):
            depth = result.metadata.get("depth", "unknown")
            print(f" {i + 1}. {result.url} (Depth: {depth})")


async def demo_js_interaction():
    """Execute JavaScript to load more content"""
    print("\n=== 7. JavaScript Interaction ===")

    # Hacker News: click the "More" link to reveal the next page of results
    async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
        # Initial load
        news_schema = {
            "name": "news",
            "baseSelector": "tr.athing",
            "fields": [
                {
                    "name": "title",
                    "selector": "span.titleline",
                    "type": "text",
                }
            ],
        }
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com",
            config=CrawlerRunConfig(
                session_id="hn_session",  # Keep session
                extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
            ),
        )

        news = []
        for result in results:
            if result.success:
                data = json.loads(result.extracted_content)
                news.extend(data)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")

        print(f"Initial items: {len(news)}")

        # Click "More" link
        more_config = CrawlerRunConfig(
            js_code="document.querySelector('a.morelink').click();",
            js_only=True,  # Continue in same page
            session_id="hn_session",  # Keep session
            extraction_strategy=JsonCssExtractionStrategy(
                schema=news_schema,
            ),
        )

        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com", config=more_config
        )

        # Extract the newly loaded items
        for result in results:
            if result.success:
                data = json.loads(result.extracted_content)
                news.extend(data)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")

        print(f"Total items: {len(news)}")


async def demo_media_and_links():
    """Extract media and links from a page"""
    print("\n=== 8. Media and Links Extraction ===")
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://en.wikipedia.org/wiki/Main_Page"
        )

        for result in results:
            # Extract and save all images
            images = result.media.get("images", [])
            print(f"Found {len(images)} images")

            # Extract and save all links (internal and external)
            internal_links = result.links.get("internal", [])
            external_links = result.links.get("external", [])
            print(f"Found {len(internal_links)} internal links")
            print(f"Found {len(external_links)} external links")

            # Print some of the images and links
            for image in images[:3]:
                print(f"Image: {image['src']}")
            for link in internal_links[:3]:
                print(f"Internal link: {link['href']}")
            for link in external_links[:3]:
                print(f"External link: {link['href']}")

            # Save everything to files
            with open(f"{__cur_dir__}/tmp/images.json", "w") as f:
                json.dump(images, f, indent=2)
            with open(f"{__cur_dir__}/tmp/links.json", "w") as f:
                json.dump(
                    {"internal": internal_links, "external": external_links},
                    f,
                    indent=2,
                )


async def demo_screenshot_and_pdf():
    """Capture screenshot and PDF of a page"""
    print("\n=== 9. Screenshot and PDF Capture ===")
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            # url="https://example.com",
            url="https://en.wikipedia.org/wiki/Giant_anteater",
            config=CrawlerRunConfig(screenshot=True, pdf=True),
        )

        for result in results:
            if result.screenshot:
                # Save screenshot (returned as base64-encoded PNG data)
                screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png"
                with open(screenshot_path, "wb") as f:
                    f.write(base64.b64decode(result.screenshot))
                print(f"Screenshot saved to {screenshot_path}")

            if result.pdf:
                # Save PDF (returned as raw bytes)
                pdf_path = f"{__cur_dir__}/tmp/example.pdf"
                with open(pdf_path, "wb") as f:
                    f.write(result.pdf)
                print(f"PDF saved to {pdf_path}")


async def demo_proxy_rotation():
    """Proxy rotation for multiple requests"""
    print("\n=== 10. Proxy Rotation ===")
    # Example proxies (replace with real ones)
    proxies = [
        ProxyConfig(server="http://proxy1.example.com:8080"),
        ProxyConfig(server="http://proxy2.example.com:8080"),
    ]
    proxy_strategy = RoundRobinProxyStrategy(proxies)
    print(f"Using {len(proxies)} proxies in rotation")
    print(
        "Note: This example uses placeholder proxies - replace with real ones to test"
    )

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            proxy_rotation_strategy=proxy_strategy
        )

        # In a real scenario, these would be run and the proxies would rotate
        print("In a real scenario, requests would rotate through the available proxies")


async def demo_raw_html_and_file():
    """Process raw HTML and local files"""
    print("\n=== 11. Raw HTML and Local Files ===")
    raw_html = """
    <html><body>
        <h1>Sample Article</h1>
        <p>This is sample content for testing Crawl4AI's raw HTML processing.</p>
    </body></html>
    """

    # Save to a temporary file next to this script, so the demo does not
    # depend on the current working directory
    file_path = (__cur_dir__ / "tmp" / "sample.html").absolute()
    with open(file_path, "w") as f:
        f.write(raw_html)

    async with AsyncWebCrawler() as crawler:
        # Crawl raw HTML
        raw_result = await crawler.arun(
            url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        )
        print("Raw HTML processing:")
        print(f" Markdown: {raw_result.markdown.raw_markdown[:50]}...")

        # Crawl local file
        file_result = await crawler.arun(
            url=f"file://{file_path}",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("\nLocal file processing:")
        print(f" Markdown: {file_result.markdown.raw_markdown[:50]}...")

    # Clean up
    os.remove(file_path)
    print(f"Processed both raw HTML and local file ({file_path})")


async def main():
    """Run all demo functions sequentially"""
    print("=== Comprehensive Crawl4AI Demo ===")
    print("Note: Some examples require API keys or other configurations")

    # Run all demos
    await demo_basic_crawl()
    await demo_parallel_crawl()
    await demo_fit_markdown()
    await demo_llm_structured_extraction_no_schema()
    await demo_css_structured_extraction_no_schema()
    await demo_deep_crawl()
    await demo_js_interaction()
    await demo_media_and_links()
    await demo_screenshot_and_pdf()
    # await demo_proxy_rotation()  # Requires real proxy servers
    await demo_raw_html_and_file()

    print("\n=== Demo Complete ===")
    print("Check for any generated files (screenshots, PDFs, JSON) in the tmp/ directory next to this script")


if __name__ == "__main__":
    asyncio.run(main())