# crawl4ai/tests/test_scraping_strategy.py

import nest_asyncio
nest_asyncio.apply()

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy, CacheMode

async def main():
    """Crawl https://example.com with the LXML scraping strategy and print a summary.

    Builds a ``CrawlerRunConfig`` using ``CacheMode.BYPASS`` and an
    ``LXMLWebScrapingStrategy``, runs a single crawl through
    ``AsyncWebCrawler``, then prints the success flag and, when markdown is
    available, the length of the raw markdown.
    """
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        scraping_strategy=LXMLWebScrapingStrategy(),  # Faster alternative to default BeautifulSoup
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=config,
        )
        print(f"Success: {result.success}")

        # The markdown payload may be None when the crawl fails; guard the
        # dereference so a failed crawl reports cleanly instead of raising
        # AttributeError right after the success flag is printed.
        markdown = result.markdown_v2 if result.success else None
        if markdown is None:
            print("Markdown length: n/a (no markdown was produced)")
            return
        print(f"Markdown length: {len(markdown.raw_markdown)}")

# Script entry point: only run the demo crawl when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    demo_coroutine = main()
    asyncio.run(demo_coroutine)
# refactor(scraping): replace ScrapingMode enum with strategy pattern
# (2025-01-13 17:53:12 +08:00)
#
# Replace the ScrapingMode enum with a proper strategy pattern implementation
# for content scraping. This change introduces:
#   - New ContentScrapingStrategy abstract base class
#   - Concrete WebScrapingStrategy and LXMLWebScrapingStrategy implementations
#   - New Pydantic models for structured scraping results
#   - Updated documentation reflecting the new strategy-based approach
#
# BREAKING CHANGE: ScrapingMode enum has been removed. Users should now use
# ContentScrapingStrategy implementations instead.
#
# Example usage:
#
#     import nest_asyncio
#     nest_asyncio.apply()
#
#     import asyncio
#     from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy, CacheMode
#
#     async def main():
#         config = CrawlerRunConfig(
#             cache_mode=CacheMode.BYPASS,
#             scraping_strategy=LXMLWebScrapingStrategy()  # Faster alternative to default BeautifulSoup
#         )
#         async with AsyncWebCrawler() as crawler:
#             result = await crawler.arun(
#                 url="https://example.com",
#                 config=config
#             )
#             print(f"Success: {result.success}")
#             print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
#
#     if __name__ == "__main__":
#         asyncio.run(main())