crawl4ai/tests/general/test_advanced_deep_crawl.py

import asyncio
import time
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer


async def main():
    """Example deep crawl of a news site (TechCrunch) with URL filtering and relevance scoring."""
    # Filters applied to candidate links discovered during the deep crawl.
    filter_chain = FilterChain([
        URLPatternFilter(patterns=["*2025*"]),             # only URLs containing "2025"
        DomainFilter(allowed_domains=["techcrunch.com"]),  # stay within techcrunch.com
        # Keep pages whose content scores at or above the relevance threshold.
        ContentRelevanceFilter(
            query="Use of artificial intelligence in Defence applications",
            threshold=1,
        ),
        ContentTypeFilter(allowed_types=["text/html", "application/javascript"]),
    ])
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=filter_chain,
            # Rank discovered links so the most relevant pages are visited first.
            url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
        ),
        stream=False,
        verbose=True,
        cache_mode=CacheMode.BYPASS,
        scraping_strategy=LXMLWebScrapingStrategy(),
    )
    async with AsyncWebCrawler() as crawler:
        print("Starting deep crawl in streaming mode:")
        # Flip the config to streaming so results are yielded as they arrive.
        config.stream = True
        start_time = time.perf_counter()
        async for result in await crawler.arun(
            url="https://techcrunch.com",
            config=config,
        ):
            print(f"{result.url} (Depth: {result.metadata.get('depth', 0)})")
        print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")


if __name__ == "__main__":
    asyncio.run(main())