
Add new features to enhance browser automation and HTML extraction: - Add CDP browser launch capability with customizable ports and profiles - Implement JsonLxmlExtractionStrategy for faster HTML parsing - Add CLI command 'crwl cdp' for launching standalone CDP browsers - Support connecting to external CDP browsers via URL - Optimize selector caching and context-sensitive queries BREAKING CHANGE: LLMConfig import path changed from crawl4ai.types to crawl4ai
87 lines
3.4 KiB
Python
87 lines
3.4 KiB
Python
import os
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
|
from crawl4ai import LLMConfig
|
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
|
|
|
async def test_llm_filter():
    """Crawl the Python classes tutorial and filter it through an LLM.

    The function:

    1. Fetches ``https://docs.python.org/3/tutorial/classes.html`` with a
       headless ``AsyncWebCrawler`` (crawler cache enabled) and takes the
       result's ``cleaned_html``.
    2. Passes that HTML to an ``LLMContentFilter`` configured for
       ``openai/gpt-4o``; the API token is read from the ``OPENAI_API_KEY``
       environment variable.
    3. Prints the number of returned chunks and a 500-character preview of
       the first one, writes all chunks (newline-joined) to
       ``filtered_content.md`` in the current directory, and reports token
       usage via ``show_usage()``.

    NOTE(review): ``filter_content`` is treated here as returning a sequence
    of strings (it is indexed and passed to ``str.join``) -- confirm against
    the ``LLMContentFilter`` API.
    """
    # Create an HTML source that needs intelligent filtering
    url = "https://docs.python.org/3/tutorial/classes.html"

    browser_config = BrowserConfig(
        headless=True,
        verbose=True,
    )

    # Alternative: CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First get the raw HTML
        result = await crawler.arun(url, config=run_config)
        html = result.cleaned_html

        # Build the LLM filter. An earlier, shorter "focused instruction"
        # variant used to be constructed here and was immediately overwritten
        # by this one; only the filter that is actually used is kept.
        # Named `content_filter` so the builtin `filter` is not shadowed.
        content_filter = LLMContentFilter(
            llm_config=LLMConfig(
                provider="openai/gpt-4o",
                api_token=os.getenv('OPENAI_API_KEY'),
            ),
            chunk_token_threshold=2 ** 12 * 2,  # 4096 * 2 = 8192
            ignore_cache=True,
            instruction="""
            Extract the main educational content while preserving its original wording and substance completely. Your task is to:

            1. Maintain the exact language and terminology used in the main content
            2. Keep all technical explanations, examples, and educational content intact
            3. Preserve the original flow and structure of the core content
            4. Remove only clearly irrelevant elements like:
               - Navigation menus
               - Advertisement sections
               - Cookie notices
               - Footers with site information
               - Sidebars with external links
               - Any UI elements that don't contribute to learning

            The goal is to create a clean markdown version that reads exactly like the original article,
            keeping all valuable content but free from distracting elements. Imagine you're creating
            a perfect reading experience where nothing valuable is lost, but all noise is removed.
            """,
            verbose=True,
        )

        # Apply filtering
        filtered_content = content_filter.filter_content(html)

        # Show results
        print("\nFiltered Content Length:", len(filtered_content))
        print("\nFirst 500 chars of filtered content:")
        if filtered_content:
            # Only the first returned chunk is previewed.
            print(filtered_content[0][:500])

        # Save the markdown version to disk
        with open("filtered_content.md", "w", encoding="utf-8") as f:
            f.write("\n".join(filtered_content))

        # Show token usage
        content_filter.show_usage()
|
|
|
|
if __name__ == "__main__":
    # Script entry point: create the demo coroutine and run it to completion
    # on a fresh event loop.
    demo_coroutine = test_llm_filter()
    asyncio.run(demo_coroutine)