""" Example demonstrating different extraction strategies with various input formats. This example shows how to: 1. Use different input formats (markdown, HTML, fit_markdown) 2. Work with JSON-based extractors (CSS and XPath) 3. Use LLM-based extraction with different input formats 4. Configure browser and crawler settings properly """ import asyncio import os from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai.async_configs import LlmConfig from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, JsonCssExtractionStrategy, JsonXPathExtractionStrategy, ) from crawl4ai.content_filter_strategy import PruningContentFilter from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str): """Helper function to run extraction with proper configuration""" try: # Configure the crawler run settings config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=strategy, markdown_generator=DefaultMarkdownGenerator( content_filter=PruningContentFilter() # For fit_markdown support ), ) # Run the crawler result = await crawler.arun(url=url, config=config) if result.success: print(f"\n=== {name} Results ===") print(f"Extracted Content: {result.extracted_content}") print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}") print( f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}" ) else: print(f"Error in {name}: Crawl failed") except Exception as e: print(f"Error in {name}: {str(e)}") async def main(): # Example URL (replace with actual URL) url = "https://example.com/product-page" # Configure browser settings browser_config = BrowserConfig(headless=True, verbose=True) # Initialize extraction strategies # 1. LLM Extraction with different input formats markdown_strategy = LLMExtractionStrategy( llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract product information including name, price, and description", ) html_strategy = LLMExtractionStrategy( input_format="html", llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract product information from HTML including structured data", ) fit_markdown_strategy = LLMExtractionStrategy( input_format="fit_markdown", llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract product information from cleaned markdown", ) # 2. JSON CSS Extraction (automatically uses HTML input) css_schema = { "baseSelector": ".product", "fields": [ {"name": "title", "selector": "h1.product-title", "type": "text"}, {"name": "price", "selector": ".price", "type": "text"}, {"name": "description", "selector": ".description", "type": "text"}, ], } css_strategy = JsonCssExtractionStrategy(schema=css_schema) # 3. 

    # 3. JSON XPath Extraction (automatically uses HTML input)
    xpath_schema = {
        "baseSelector": "//div[@class='product']",
        "fields": [
            {
                "name": "title",
                "selector": ".//h1[@class='product-title']/text()",
                "type": "text",
            },
            {
                "name": "price",
                "selector": ".//span[@class='price']/text()",
                "type": "text",
            },
            {
                "name": "description",
                "selector": ".//div[@class='description']/text()",
                "type": "text",
            },
        ],
    }
    xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)

    # Use a context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run all strategies
        await run_extraction(crawler, url, markdown_strategy, "Markdown LLM")
        await run_extraction(crawler, url, html_strategy, "HTML LLM")
        await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM")
        await run_extraction(crawler, url, css_strategy, "CSS Extraction")
        await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")


if __name__ == "__main__":
    asyncio.run(main())
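

# Optional offline smoke test for the CSS schema (a minimal sketch). It
# assumes the installed crawl4ai version supports the "raw://" URL prefix for
# crawling inline HTML without a network request; the sample markup below is
# invented to match the selectors in css_schema. Run it separately with
# `asyncio.run(smoke_test_css_schema())`.
async def smoke_test_css_schema():
    sample_html = (
        "<div class='product'>"
        "<h1 class='product-title'>Demo Widget</h1>"
        "<span class='price'>$9.99</span>"
        "<div class='description'>A sample product for schema testing.</div>"
        "</div>"
    )
    schema = {
        "baseSelector": ".product",
        "fields": [
            {"name": "title", "selector": "h1.product-title", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "description", "selector": ".description", "type": "text"},
        ],
    }
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        await run_extraction(
            crawler,
            f"raw://{sample_html}",
            JsonCssExtractionStrategy(schema=schema),
            "CSS Smoke Test",
        )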