crawl4ai/docs/examples/extraction_strategies_examples.py


"""
Example demonstrating different extraction strategies with various input formats.
This example shows how to:
1. Use different input formats (markdown, HTML, fit_markdown)
2. Work with JSON-based extractors (CSS and XPath)
3. Use LLM-based extraction with different input formats
4. Configure browser and crawler settings properly
"""
import asyncio
import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy,
)
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
"""Helper function to run extraction with proper configuration"""
try:
# Configure the crawler run settings
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=strategy,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter() # For fit_markdown support
2025-01-13 19:19:58 +08:00
),
)
2025-01-13 19:19:58 +08:00
# Run the crawler
result = await crawler.arun(url=url, config=config)
2025-01-13 19:19:58 +08:00
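
        # result.extracted_content holds the strategy's output
        # (typically a JSON string for these extraction strategies)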
        if result.success:
            print(f"\n=== {name} Results ===")
            print(f"Extracted Content: {result.extracted_content}")
            print(f"Raw Markdown Length: {len(result.markdown.raw_markdown)}")
            print(
                f"Citations Markdown Length: {len(result.markdown.markdown_with_citations)}"
            )
        else:
            print(f"Error in {name}: Crawl failed")

    except Exception as e:
        print(f"Error in {name}: {str(e)}")

async def main():
    # Example URL (replace with an actual URL)
    url = "https://example.com/product-page"

    # Configure browser settings
    browser_config = BrowserConfig(headless=True, verbose=True)

    # Initialize extraction strategies

    # 1. LLM Extraction with different input formats
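    # No input_format is given on this first strategy, so it uses its default
    # input: the generated markdown of the page.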
    markdown_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information including name, price, and description",
    )

    html_strategy = LLMExtractionStrategy(
        input_format="html",
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information from HTML including structured data",
    )
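
    # "fit_markdown" relies on the PruningContentFilter configured in
    # run_extraction's markdown generator above.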
    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information from cleaned markdown",
    )

    # 2. JSON CSS Extraction (automatically uses HTML input)
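    # baseSelector matches each product element; the field selectors are applied
    # relative to it, so each match yields one extracted object. No LLM is used.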
    css_schema = {
        "baseSelector": ".product",
        "fields": [
            {"name": "title", "selector": "h1.product-title", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "description", "selector": ".description", "type": "text"},
        ],
    }
    css_strategy = JsonCssExtractionStrategy(schema=css_schema)

    # 3. JSON XPath Extraction (automatically uses HTML input)
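    # The XPath field selectors are relative to baseSelector (note the leading ".")
    # and end in text() so that only text content is returned.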
    xpath_schema = {
        "baseSelector": "//div[@class='product']",
        "fields": [
            {
                "name": "title",
                "selector": ".//h1[@class='product-title']/text()",
                "type": "text",
            },
            {
                "name": "price",
                "selector": ".//span[@class='price']/text()",
                "type": "text",
            },
            {
                "name": "description",
                "selector": ".//div[@class='description']/text()",
                "type": "text",
            },
        ],
    }
    xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)

    # Use a context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run all strategies
        await run_extraction(crawler, url, markdown_strategy, "Markdown LLM")
        await run_extraction(crawler, url, html_strategy, "HTML LLM")
        await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM")
        await run_extraction(crawler, url, css_strategy, "CSS Extraction")
        await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")

if __name__ == "__main__":
    asyncio.run(main())