crawl4ai/docs/examples/extraction_strategies_examples.py


"""
Example demonstrating different extraction strategies with various input formats.
This example shows how to:
1. Use different input formats (markdown, HTML, fit_markdown)
2. Work with JSON-based extractors (CSS and XPath)
3. Use LLM-based extraction with different input formats
4. Configure browser and crawler settings properly
"""
import asyncio
import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy,
)
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
"""Helper function to run extraction with proper configuration"""
try:
# Configure the crawler run settings
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=strategy,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter() # For fit_markdown support
2025-01-13 19:19:58 +08:00
),
)
2025-01-13 19:19:58 +08:00
# Run the crawler
result = await crawler.arun(url=url, config=config)
2025-01-13 19:19:58 +08:00
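
        # result.extracted_content holds the strategy's output
        # (typically a JSON string for these extraction strategies)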
        if result.success:
            print(f"\n=== {name} Results ===")
            print(f"Extracted Content: {result.extracted_content}")
            print(f"Raw Markdown Length: {len(result.markdown.raw_markdown)}")
            print(
                f"Citations Markdown Length: {len(result.markdown.markdown_with_citations)}"
            )
        else:
            print(f"Error in {name}: Crawl failed")

    except Exception as e:
        print(f"Error in {name}: {str(e)}")

async def main():
    # Example URL (replace with an actual URL)
    url = "https://example.com/product-page"

    # Configure browser settings
    browser_config = BrowserConfig(headless=True, verbose=True)

    # Initialize extraction strategies

    # 1. LLM Extraction with different input formats
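    # No input_format is given on this first strategy, so it uses its default
    # input: the generated markdown of the page.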
    markdown_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information including name, price, and description",
    )

    html_strategy = LLMExtractionStrategy(
        input_format="html",
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information from HTML including structured data",
    )
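
    # "fit_markdown" relies on the PruningContentFilter configured in
    # run_extraction's markdown generator above.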
    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information from cleaned markdown",
    )

    # 2. JSON CSS Extraction (automatically uses HTML input)
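    # baseSelector matches each product element; the field selectors are applied
    # relative to it, so each match yields one extracted object. No LLM is used.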
    css_schema = {
        "baseSelector": ".product",
        "fields": [
            {"name": "title", "selector": "h1.product-title", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "description", "selector": ".description", "type": "text"},
        ],
    }
    css_strategy = JsonCssExtractionStrategy(schema=css_schema)

    # 3. JSON XPath Extraction (automatically uses HTML input)
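    # The XPath field selectors are relative to baseSelector (note the leading ".")
    # and end in text() so that only text content is returned.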
    xpath_schema = {
        "baseSelector": "//div[@class='product']",
        "fields": [
            {
                "name": "title",
                "selector": ".//h1[@class='product-title']/text()",
                "type": "text",
            },
            {
                "name": "price",
                "selector": ".//span[@class='price']/text()",
                "type": "text",
            },
            {
                "name": "description",
                "selector": ".//div[@class='description']/text()",
                "type": "text",
            },
        ],
    }
    xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)

    # Use a context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run all strategies
        await run_extraction(crawler, url, markdown_strategy, "Markdown LLM")
        await run_extraction(crawler, url, html_strategy, "HTML LLM")
        await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM")
        await run_extraction(crawler, url, css_strategy, "CSS Extraction")
        await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")

if __name__ == "__main__":
    asyncio.run(main())