crawl4ai/docs/examples/markdown/content_source_short_example.py
UncleCode 7db6b468d9 feat(markdown): add content source selection for markdown generation
Adds a new content_source parameter to MarkdownGenerationStrategy that allows
selecting which HTML content to use for markdown generation:
- cleaned_html (default): uses post-processed HTML
- raw_html: uses original webpage HTML
- fit_html: uses preprocessed HTML for schema extraction

Changes include:
- Added content_source parameter to MarkdownGenerationStrategy
- Updated AsyncWebCrawler to handle HTML source selection
- Added examples and tests for the new feature
- Updated documentation with new parameter details

BREAKING CHANGE: Renamed cleaned_html parameter to input_html in generate_markdown()
method signature to better reflect its generalized purpose
2025-04-17 20:13:53 +08:00

42 lines
1.9 KiB
Python

"""
Example demonstrating how to use the content_source parameter in MarkdownGenerationStrategy
"""
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
async def demo_markdown_source_config():
print("\n=== Demo: Configuring Markdown Source ===")
# Example 1: Generate markdown from cleaned HTML (default behavior)
cleaned_md_generator = DefaultMarkdownGenerator(content_source="cleaned_html")
config_cleaned = CrawlerRunConfig(markdown_generator=cleaned_md_generator)
async with AsyncWebCrawler() as crawler:
result_cleaned = await crawler.arun(url="https://example.com", config=config_cleaned)
print("Markdown from Cleaned HTML (default):")
print(f" Length: {len(result_cleaned.markdown.raw_markdown)}")
print(f" Start: {result_cleaned.markdown.raw_markdown[:100]}...")
# Example 2: Generate markdown directly from raw HTML
raw_md_generator = DefaultMarkdownGenerator(content_source="raw_html")
config_raw = CrawlerRunConfig(markdown_generator=raw_md_generator)
async with AsyncWebCrawler() as crawler:
result_raw = await crawler.arun(url="https://example.com", config=config_raw)
print("\nMarkdown from Raw HTML:")
print(f" Length: {len(result_raw.markdown.raw_markdown)}")
print(f" Start: {result_raw.markdown.raw_markdown[:100]}...")
# Example 3: Generate markdown from preprocessed 'fit' HTML
fit_md_generator = DefaultMarkdownGenerator(content_source="fit_html")
config_fit = CrawlerRunConfig(markdown_generator=fit_md_generator)
async with AsyncWebCrawler() as crawler:
result_fit = await crawler.arun(url="https://example.com", config=config_fit)
print("\nMarkdown from Fit HTML:")
print(f" Length: {len(result_fit.markdown.raw_markdown)}")
print(f" Start: {result_fit.markdown.raw_markdown[:100]}...")
if __name__ == "__main__":
asyncio.run(demo_markdown_source_config())