
Adds a new content_source parameter to MarkdownGenerationStrategy that allows selecting which HTML content to use for markdown generation:

- cleaned_html (default): uses post-processed HTML
- raw_html: uses original webpage HTML
- fit_html: uses preprocessed HTML for schema extraction

Changes include:

- Added content_source parameter to MarkdownGenerationStrategy
- Updated AsyncWebCrawler to handle HTML source selection
- Added examples and tests for the new feature
- Updated documentation with new parameter details

BREAKING CHANGE: Renamed the cleaned_html parameter to input_html in the generate_markdown() method signature to better reflect its generalized purpose.
64 lines
2.8 KiB
Python
64 lines
2.8 KiB
Python
"""
Example showing how to use the content_source parameter to control HTML input for markdown generation.
"""
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
|
|
|
async def demo_content_source():
    """Crawl one page once per content_source option and print a side-by-side summary.

    Three markdown generators are built (default, ``raw_html``, ``fit_html``),
    each wrapped in its own run configuration, and the same URL is crawled
    with each configuration in turn using a single crawler instance. The
    length and first 80 characters of every resulting raw markdown are then
    printed, followed by a short list of takeaways.
    """
    url = "https://example.com"  # Simple demo site

    print("Crawling with different content_source options...")

    # One entry per run: (summary heading, keyword arguments for the generator).
    #   - Default: no arguments, so the generator falls back to "cleaned_html",
    #     i.e. the HTML after the scraping strategy has processed it (cleaned,
    #     simplified, optimized for readability).
    #   - raw_html: the original HTML straight from the webpage; keeps more
    #     content but may include navigation, ads, etc.
    #   - fit_html: preprocessed HTML optimized for schema extraction; better
    #     for structured data but may lose some formatting.
    variants = (
        ("1. Default (cleaned_html):", {}),
        ("2. Raw HTML:", {"content_source": "raw_html"}),
        ("3. Fit HTML:", {"content_source": "fit_html"}),
    )

    # Build every generator/config pair up front, before the crawler is opened.
    run_configs = []
    for _heading, generator_kwargs in variants:
        generator = DefaultMarkdownGenerator(**generator_kwargs)
        run_configs.append(CrawlerRunConfig(markdown_generator=generator))

    # Run the crawls one after another, reusing the same crawler.
    results = []
    async with AsyncWebCrawler() as crawler:
        for run_config in run_configs:
            results.append(await crawler.arun(url=url, config=run_config))

    # Summarize each result in the same order the crawls were executed.
    print("\nMarkdown Generation Results:\n")

    for (heading, _kwargs), result in zip(variants, results):
        print(heading)
        print(f" Length: {len(result.markdown.raw_markdown)} chars")
        print(f" First 80 chars: {result.markdown.raw_markdown[:80]}...\n")

    # Closing notes on when each option is the right choice.
    print("\nKey Takeaways:")
    for takeaway in (
        "- cleaned_html: Best for readable, focused content",
        "- raw_html: Preserves more original content, but may include noise",
        "- fit_html: Optimized for schema extraction and structured data",
    ):
        print(takeaway)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: drive the demo coroutine to completion.
    asyncio.run(demo_content_source())