# crawl4ai/docs/examples/extraction_strategies_examples.py

"""
Example demonstrating different extraction strategies with various input formats.
This example shows how to:
1. Use different input formats (markdown, HTML, fit_markdown)
2. Work with JSON-based extractors (CSS and XPath)
3. Use LLM-based extraction with different input formats
4. Configure browser and crawler settings properly
"""
import asyncio
import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_configs import LlmConfig
from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy,
)
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
"""Helper function to run extraction with proper configuration"""
try:
# Configure the crawler run settings
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=strategy,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter() # For fit_markdown support
2025-01-13 19:19:58 +08:00
),
)

        # Run the crawler
        result = await crawler.arun(url=url, config=config)

        if result.success:
            print(f"\n=== {name} Results ===")
print(f"Extracted Content: {result.extracted_content}")
Release prep (#749) * fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
print(f"Raw Markdown Length: {len(result.markdown.raw_markdown)}")
2025-01-13 19:19:58 +08:00
print(
Release prep (#749) * fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
f"Citations Markdown Length: {len(result.markdown.markdown_with_citations)}"
2025-01-13 19:19:58 +08:00
)
else:
print(f"Error in {name}: Crawl failed")
2025-01-13 19:19:58 +08:00
except Exception as e:
print(f"Error in {name}: {str(e)}")
2025-01-13 19:19:58 +08:00
async def main():
    # Example URL (replace with actual URL)
    url = "https://example.com/product-page"

    # Configure browser settings
    browser_config = BrowserConfig(headless=True, verbose=True)

    # Initialize extraction strategies

    # 1. LLM Extraction with different input formats
    markdown_strategy = LLMExtractionStrategy(
        llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information including name, price, and description",
    )

    html_strategy = LLMExtractionStrategy(
        input_format="html",
        llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information from HTML including structured data",
    )

    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
        llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information from cleaned markdown",
    )
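    # "fit_markdown" comes from the PruningContentFilter configured in
    # run_extraction's markdown generator above.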

    # 2. JSON CSS Extraction (automatically uses HTML input)
    css_schema = {
        "baseSelector": ".product",
        "fields": [
            {"name": "title", "selector": "h1.product-title", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "description", "selector": ".description", "type": "text"},
        ],
    }
    css_strategy = JsonCssExtractionStrategy(schema=css_schema)
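    # The same schema format can also extract attributes, e.g. (illustrative
    # selector, adjust to the target page's markup):
    #   {"name": "image", "selector": "img.product-image", "type": "attribute", "attribute": "src"}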

    # 3. JSON XPath Extraction (automatically uses HTML input)
    xpath_schema = {
        "baseSelector": "//div[@class='product']",
        "fields": [
            {
                "name": "title",
                "selector": ".//h1[@class='product-title']/text()",
                "type": "text",
            },
            {
                "name": "price",
                "selector": ".//span[@class='price']/text()",
                "type": "text",
            },
            {
                "name": "description",
                "selector": ".//div[@class='description']/text()",
                "type": "text",
            },
        ],
    }
    xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)

    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run all strategies
        await run_extraction(crawler, url, markdown_strategy, "Markdown LLM")
        await run_extraction(crawler, url, html_strategy, "HTML LLM")
        await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM")
        await run_extraction(crawler, url, css_strategy, "CSS Extraction")
        await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")


if __name__ == "__main__":
    asyncio.run(main())