crawl4ai/tests/general/test_stream_dispatch.py

import os, sys
# Make crawl4ai importable when this file is run directly: resolve the
# directory two levels above this file and that directory's own parent,
# then register both at the end of sys.path in that order.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.extend((parent_dir, parent_parent_dir))


import asyncio
from typing import List
from crawl4ai import *
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher

async def test_streaming():
    """Stream results for ten identical URLs and print each one as it arrives.

    Builds a headless, verbose browser configuration and a run configuration
    with ``cache_mode=CacheMode.BYPASS`` and a default markdown generator,
    then iterates ``MemoryAdaptiveDispatcher.run_urls_stream`` inside an
    ``AsyncWebCrawler`` context. Nothing is asserted; each yielded item is
    only printed.
    """
    browser_settings = BrowserConfig(headless=True, verbose=True)
    markdown_gen = DefaultMarkdownGenerator()
    run_settings = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=markdown_gen,
    )

    # The same URL repeated, so the dispatcher has several tasks to schedule.
    target_urls = ["http://example.com" for _ in range(10)]

    async with AsyncWebCrawler(config=browser_settings) as crawler:
        stream = MemoryAdaptiveDispatcher(
            max_session_permit=5,
            check_interval=0.5,
        ).run_urls_stream(target_urls, crawler, run_settings)

        # Each yielded item exposes the URL and a nested crawl result.
        async for result in stream:
            print(f"Got result for {result.url} - Success: {result.result.success}")

# Script entry point: when this file is executed directly (rather than
# imported), run the streaming coroutine to completion with asyncio.run.
if __name__ == "__main__":
    asyncio.run(test_streaming())
feat(dispatcher): add streaming support for URL processing

Add new streaming capability to the MemoryAdaptiveDispatcher and AsyncWebCrawler to allow processing URLs with real-time result streaming. This enables processing results as they become available rather than waiting for all URLs to complete.

Key changes:
- Add run_urls_stream method to MemoryAdaptiveDispatcher
- Update AsyncWebCrawler.arun_many to support streaming mode
- Add result queue for better result handling
- Improve type hints and documentation

BREAKING CHANGE: The return type of arun_many now depends on the 'stream' parameter, returning either List[CrawlResult] or AsyncGenerator[CrawlResult, None].

2025-01-19 14:03:34 +08:00
			`import os, sys`
			`# append 2 parent directories to sys.path to import crawl4ai`
			`parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))`
			`sys.path.append(parent_dir)`
			`parent_parent_dir = os.path.dirname(parent_dir)`
			`sys.path.append(parent_parent_dir)`


			`import asyncio`
			`from typing import List`
			`from crawl4ai import *`
			`from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher`

			`async def test_streaming():`
			`browser_config = BrowserConfig(headless=True, verbose=True)`
			`crawler_config = CrawlerRunConfig(`
			`cache_mode=CacheMode.BYPASS,`
			`markdown_generator=DefaultMarkdownGenerator(`
			`# content_filter=PruningContentFilter(`
			`# threshold=0.48,`
			`# threshold_type="fixed",`
			`# min_word_threshold=0`
			`# )`
			`),`
			`)`

			`urls = ["http://example.com"] * 10`

			`async with AsyncWebCrawler(config=browser_config) as crawler:`
			`dispatcher = MemoryAdaptiveDispatcher(`
			`max_session_permit=5,`
			`check_interval=0.5`
			`)`

			`async for result in dispatcher.run_urls_stream(urls, crawler, crawler_config):`
			`print(f"Got result for {result.url} - Success: {result.result.success}")`

			`if __name__ == "__main__":`
			`asyncio.run(test_streaming())`