# crawl4ai/docs/examples/extraction_strategies_examples.py

"""
Example demonstrating different extraction strategies with various input formats.
This example shows how to:
1. Use different input formats (markdown, HTML, fit_markdown)
2. Work with JSON-based extractors (CSS and XPath)
3. Use LLM-based extraction with different input formats
4. Configure browser and crawler settings properly
"""
import asyncio
import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_configs import LlmConfig
from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy,
)
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
"""Helper function to run extraction with proper configuration"""
try:
# Configure the crawler run settings
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=strategy,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter() # For fit_markdown support
2025-01-13 19:19:58 +08:00
),
)

        # Run the crawler
        result = await crawler.arun(url=url, config=config)

        if result.success:
            print(f"\n=== {name} Results ===")
print(f"Extracted Content: {result.extracted_content}")
Release prep (#749) * fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
print(f"Raw Markdown Length: {len(result.markdown.raw_markdown)}")
2025-01-13 19:19:58 +08:00
print(
Release prep (#749) * fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
f"Citations Markdown Length: {len(result.markdown.markdown_with_citations)}"
2025-01-13 19:19:58 +08:00
)
else:
print(f"Error in {name}: Crawl failed")
2025-01-13 19:19:58 +08:00
except Exception as e:
print(f"Error in {name}: {str(e)}")
2025-01-13 19:19:58 +08:00
async def main():
    # Example URL (replace with actual URL)
    url = "https://example.com/product-page"

    # Configure browser settings
    browser_config = BrowserConfig(headless=True, verbose=True)

    # Initialize extraction strategies

    # 1. LLM Extraction with different input formats
    markdown_strategy = LLMExtractionStrategy(
        llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information including name, price, and description",
    )

    html_strategy = LLMExtractionStrategy(
        input_format="html",
        llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information from HTML including structured data",
    )

    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
        llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information from cleaned markdown",
    )
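    # "fit_markdown" comes from the PruningContentFilter configured in
    # run_extraction's markdown generator above.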

    # 2. JSON CSS Extraction (automatically uses HTML input)
    css_schema = {
        "baseSelector": ".product",
        "fields": [
            {"name": "title", "selector": "h1.product-title", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "description", "selector": ".description", "type": "text"},
        ],
    }
    css_strategy = JsonCssExtractionStrategy(schema=css_schema)
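    # The same schema format can also extract attributes, e.g. (illustrative
    # selector, adjust to the target page's markup):
    #   {"name": "image", "selector": "img.product-image", "type": "attribute", "attribute": "src"}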

    # 3. JSON XPath Extraction (automatically uses HTML input)
    xpath_schema = {
        "baseSelector": "//div[@class='product']",
        "fields": [
            {
                "name": "title",
                "selector": ".//h1[@class='product-title']/text()",
                "type": "text",
            },
            {
                "name": "price",
                "selector": ".//span[@class='price']/text()",
                "type": "text",
            },
            {
                "name": "description",
                "selector": ".//div[@class='description']/text()",
                "type": "text",
            },
        ],
    }
    xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)

    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run all strategies
        await run_extraction(crawler, url, markdown_strategy, "Markdown LLM")
        await run_extraction(crawler, url, html_strategy, "HTML LLM")
        await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM")
        await run_extraction(crawler, url, css_strategy, "CSS Extraction")
        await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")


if __name__ == "__main__":
    asyncio.run(main())