"""
Example demonstrating different extraction strategies with various input formats.

This example shows how to:
1. Use different input formats (markdown, HTML, fit_markdown)
2. Work with JSON-based extractors (CSS and XPath)
3. Use LLM-based extraction with different input formats
4. Configure browser and crawler settings properly
"""
import asyncio
import os
from typing import Dict, Any

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy
)
from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
    """Crawl *url* with *strategy* plugged in and print a short result summary.

    Args:
        crawler: An already-started ``AsyncWebCrawler`` instance.
        url: Page to crawl.
        strategy: Extraction strategy instance for this run.
        name: Human-readable label used in the printed output.
    """
    try:
        # Fresh run config per call: cache bypass forces a live fetch, and the
        # pruning content filter is what enables fit_markdown-based strategies.
        run_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=strategy,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter()  # For fit_markdown support
            ),
        )

        crawl_result = await crawler.arun(url=url, config=run_cfg)

        # Guard clause: report the failure and bail out early.
        if not crawl_result.success:
            print(f"Error in {name}: Crawl failed")
            return

        print(f"\n=== {name} Results ===")
        print(f"Extracted Content: {crawl_result.extracted_content}")
        print(f"Raw Markdown Length: {len(crawl_result.markdown_v2.raw_markdown)}")
        print(f"Citations Markdown Length: {len(crawl_result.markdown_v2.markdown_with_citations)}")

    except Exception as e:
        # Best-effort example helper: surface the error instead of crashing
        # the remaining strategy runs.
        print(f"Error in {name}: {str(e)}")
async def main():
    """Build several extraction strategies and run each one against an example URL."""
    # Example URL (replace with actual URL)
    url = "https://example.com/product-page"

    # Browser-level settings shared by every run below.
    browser_cfg = BrowserConfig(
        headless=True,
        verbose=True
    )

    # Read the key once; every LLM strategy below shares it.
    api_key = os.getenv("OPENAI_API_KEY")

    # 1. LLM extraction with different input formats.
    markdown_strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o-mini",
        api_token=api_key,
        instruction="Extract product information including name, price, and description"
    )

    html_strategy = LLMExtractionStrategy(
        input_format="html",
        provider="openai/gpt-4o-mini",
        api_token=api_key,
        instruction="Extract product information from HTML including structured data"
    )

    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
        provider="openai/gpt-4o-mini",
        api_token=api_key,
        instruction="Extract product information from cleaned markdown"
    )

    # 2. JSON CSS extraction (automatically uses HTML input).
    css_schema = {
        "baseSelector": ".product",
        "fields": [
            {"name": "title", "selector": "h1.product-title", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "description", "selector": ".description", "type": "text"}
        ]
    }
    css_strategy = JsonCssExtractionStrategy(schema=css_schema)

    # 3. JSON XPath extraction (automatically uses HTML input).
    xpath_schema = {
        "baseSelector": "//div[@class='product']",
        "fields": [
            {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
            {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
            {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"}
        ]
    }
    xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)

    # Run every strategy in order against the same URL.
    runs = [
        (markdown_strategy, "Markdown LLM"),
        (html_strategy, "HTML LLM"),
        (fit_markdown_strategy, "Fit Markdown LLM"),
        (css_strategy, "CSS Extraction"),
        (xpath_strategy, "XPath Extraction"),
    ]

    # Context manager guarantees the browser is shut down even if a run fails.
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        for strategy, label in runs:
            await run_extraction(crawler, url, strategy, label)
# Script entry point: drive the whole async example with a single event loop.
if __name__ == "__main__":
    asyncio.run(main())