# Patch the current event loop so asyncio.run() works even when a loop is
# already running (e.g. inside Jupyter/IPython). Must happen before any
# asyncio machinery is used, hence the early apply() call.
import nest_asyncio

nest_asyncio.apply()

import asyncio

from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    LXMLWebScrapingStrategy,
    CacheMode,
)
async def main():
    """Crawl https://example.com once and print a short result summary.

    Configures the run to bypass any cached result and to use the LXML
    scraping strategy, then reports whether the crawl succeeded and the
    length of the extracted raw markdown.
    """
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,  # always fetch fresh content, ignore cache
        scraping_strategy=LXMLWebScrapingStrategy(),  # Faster alternative to default BeautifulSoup
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        print(f"Success: {result.success}")
        print(f"Markdown length: {len(result.markdown.raw_markdown)}")
# Script entry point: run the crawl coroutine to completion.
# nest_asyncio.apply() above makes this safe even inside a running loop.
if __name__ == "__main__":
    asyncio.run(main())