mirror of https://github.com/unclecode/crawl4ai.git (synced 2025-11-11 09:17:32 +00:00)
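"""Smoke test for AsyncWebCrawler.arun_many() in streaming and batch modes."""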
import os, sys
# append 2 parent directories to sys.path to import crawl4ai
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)

import asyncio
from crawl4ai import *

async def test_crawler():
    # Setup configurations
    browser_config = BrowserConfig(headless=True, verbose=False)
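    # PruningContentFilter drops low-scoring page blocks before markdown
    # generation; "fixed" applies the 0.48 threshold uniformly, and
    # min_word_threshold=0 means even very short blocks are scored.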
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
    )

    # Test URLs - mix of different sites
    urls = [
        "http://example.com",
        "http://example.org",
        "http://example.net",
    ] * 10  # 30 total URLs

    async with AsyncWebCrawler(config=browser_config) as crawler:
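        # Streaming mode: with stream=True, arun_many() returns an async
        # generator that yields each result as soon as that URL finishes.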
        print("\n=== Testing Streaming Mode ===")
        async for result in await crawler.arun_many(
            urls=urls,
            config=crawler_config,
            stream=True,
            verbose=True
        ):
            print(f"Received result for: {result.url} - Success: {result.success}")
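
        # Batch mode: with stream=False, arun_many() returns the full list
        # of results only after every URL has been processed.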
        print("\n=== Testing Batch Mode ===")
        results = await crawler.arun_many(
            urls=urls,
            config=crawler_config,
            stream=False,
            verbose=True
        )
        print(f"Received all {len(results)} results at once")
        for result in results:
            print(f"Batch result for: {result.url} - Success: {result.success}")

if __name__ == "__main__":
    asyncio.run(test_crawler())