import asyncio
import time
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
# Alternative strategy (priority-ordered rather than breadth-first); a hedged
# sketch of it appears after main() below:
# from crawl4ai.deep_crawling import BestFirstCrawlingStrategy


async def main():
    """Example deep crawl of a documentation site in batch and streaming modes."""
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,             # follow links up to two hops from the start URL
            include_external=False,  # stay within the starting domain
        ),
        stream=False,                # batch mode first; flipped to True further down
        verbose=True,
        cache_mode=CacheMode.BYPASS,                  # always fetch fresh pages
        scraping_strategy=LXMLWebScrapingStrategy(),  # faster lxml-based scraper
    )
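
    # With stream=False, arun() returns the complete list of crawled pages once
    # the deep crawl finishes; with stream=True it yields results as they arrive.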
    async with AsyncWebCrawler() as crawler:
        start_time = time.perf_counter()
        print("\nStarting deep crawl in batch mode:")
        results = await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=config,
        )
        print(f"Crawled {len(results)} pages")
        if results:  # guard against an empty crawl before indexing
            print(f"Example page: {results[0].url}")
        print(f"Duration: {time.perf_counter() - start_time:.2f} seconds\n")
        print("Starting deep crawl in streaming mode:")
        config.stream = True
        start_time = time.perf_counter()
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=config,
        ):
            # Each result carries its crawl depth in the metadata dict.
            print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
        print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
if __name__ == "__main__":
    asyncio.run(main())