
Adds a new content_source parameter to MarkdownGenerationStrategy that allows selecting which HTML content to use for markdown generation:
- cleaned_html (default): uses post-processed HTML
- raw_html: uses original webpage HTML
- fit_html: uses preprocessed HTML for schema extraction

Changes include:
- Added content_source parameter to MarkdownGenerationStrategy
- Updated AsyncWebCrawler to handle HTML source selection
- Added examples and tests for the new feature
- Updated documentation with new parameter details

BREAKING CHANGE: Renamed the cleaned_html parameter to input_html in the generate_markdown() method signature to better reflect its generalized purpose.
42 lines
1.9 KiB
Python
42 lines
1.9 KiB
Python
"""
Example demonstrating how to use the content_source parameter in MarkdownGenerationStrategy.
"""
|
|
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
|
|
|
async def demo_markdown_source_config():
    """Crawl one page once per ``content_source`` option and print a summary.

    For each of ``"cleaned_html"`` (the default), ``"raw_html"`` and
    ``"fit_html"`` a ``DefaultMarkdownGenerator`` is built with that
    ``content_source``, wrapped in a ``CrawlerRunConfig`` and used to crawl
    ``https://example.com``. For every crawl the length and the first 100
    characters of ``result.markdown.raw_markdown`` are printed.

    A single ``AsyncWebCrawler`` session is shared by all three crawls, so
    the crawler is started and shut down only once.

    Returns:
        None. All output goes to stdout.
    """
    print("\n=== Demo: Configuring Markdown Source ===")

    # (heading printed above the summary, content_source passed to the generator)
    examples = [
        # Example 1: Generate markdown from cleaned HTML (default behavior)
        ("Markdown from Cleaned HTML (default):", "cleaned_html"),
        # Example 2: Generate markdown directly from raw HTML
        ("\nMarkdown from Raw HTML:", "raw_html"),
        # Example 3: Generate markdown from preprocessed 'fit' HTML
        ("\nMarkdown from Fit HTML:", "fit_html"),
    ]

    async with AsyncWebCrawler() as crawler:
        for heading, content_source in examples:
            md_generator = DefaultMarkdownGenerator(content_source=content_source)
            run_config = CrawlerRunConfig(markdown_generator=md_generator)
            result = await crawler.arun(url="https://example.com", config=run_config)

            print(heading)
            # NOTE(review): assumes CrawlResult exposes `success` and
            # `error_message`, and that `markdown` may be None after a failed
            # crawl - confirm against the installed crawl4ai version.
            if not result.success or result.markdown is None:
                print(f" Crawl failed: {result.error_message}")
                continue

            raw_markdown = result.markdown.raw_markdown
            print(f" Length: {len(raw_markdown)}")
            print(f" Start: {raw_markdown[:100]}...")
|
|
|
|
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script, not on import.
    demo_coroutine = demo_markdown_source_config()
    asyncio.run(demo_coroutine)