# pyright: ignore
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
import json
import asyncio
from typing import AsyncGenerator
from datetime import datetime
from crawl4ai import (
    BrowserConfig,
    CrawlerRunConfig,
    AsyncWebCrawler,
    MemoryAdaptiveDispatcher,
    RateLimiter,
)
from .models import CrawlRequest, CrawlResponse


class CrawlJSONEncoder(json.JSONEncoder):
    """Custom JSON encoder for crawler results"""
    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='ignore')
        if hasattr(obj, 'model_dump'):
            return obj.model_dump()
        if hasattr(obj, '__dict__'):
            return {k: v for k, v in obj.__dict__.items() if not k.startswith('_')}
        return str(obj)  # Fallback to string representation
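

# Illustrative usage of CrawlJSONEncoder (not part of the request flow): datetimes,
# bytes, and Pydantic-style objects all serialize to plain JSON. The field names below
# are hypothetical, chosen only for the example:
#   json.dumps({"fetched_at": datetime.now(), "raw": b"<html>"}, cls=CrawlJSONEncoder)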


def serialize_result(result) -> dict:
    """Safely serialize a crawler result"""
    try:
        # Convert to dict handling special cases
        if hasattr(result, 'model_dump'):
            result_dict = result.model_dump()
        else:
            result_dict = {
                k: v for k, v in result.__dict__.items()
                if not k.startswith('_')
            }

        # Remove known non-serializable objects
        result_dict.pop('ssl_certificate', None)
        result_dict.pop('downloaded_files', None)

        return result_dict
    except Exception as e:
        print(f"Error serializing result: {e}")
        return {"error": str(e), "url": getattr(result, 'url', 'unknown')}


app = FastAPI(title="Crawl4AI API")


async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
    """Stream results and manage crawler lifecycle"""
    try:
        async for result in results_gen:
            try:
                # Serialize the result (non-serializable fields are stripped inside serialize_result)
                result_dict = serialize_result(result)
                print(f"Streaming result for URL: {result_dict['url']}, Success: {result_dict['success']}")
                yield (json.dumps(result_dict, cls=CrawlJSONEncoder) + "\n").encode('utf-8')
            except Exception as e:
                # Log error but continue streaming
                print(f"Error serializing result: {e}")
                error_response = {
                    "error": str(e),
                    "url": getattr(result, 'url', 'unknown')
                }
                yield (json.dumps(error_response) + "\n").encode('utf-8')
    except asyncio.CancelledError:
        # Handle client disconnection gracefully
        print("Client disconnected, cleaning up...")
    finally:
        # Ensure crawler cleanup happens in all cases
        try:
            await crawler.close()
        except Exception as e:
            print(f"Error closing crawler: {e}")


@app.post("/crawl")
async def crawl(request: CrawlRequest):
    browser_config, crawler_config = request.get_configs()

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=75.0,
        rate_limiter=RateLimiter(base_delay=(1.0, 2.0)),
        # monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED)
    )

    try:
        if crawler_config.stream:
            # For streaming, manage crawler lifecycle manually
            crawler = AsyncWebCrawler(config=browser_config)
            await crawler.start()

            results_gen = await crawler.arun_many(
                urls=request.urls,
                config=crawler_config,
                dispatcher=dispatcher
            )

            return StreamingResponse(
                stream_results(crawler, results_gen),
                media_type='application/x-ndjson'
            )
        else:
            # For non-streaming, use context manager
            async with AsyncWebCrawler(config=browser_config) as crawler:
                results = await crawler.arun_many(
                    urls=request.urls,
                    config=crawler_config,
                    dispatcher=dispatcher
                )
                # Handle serialization of results
                results_dict = []
                for result in results:
                    try:
                        result_dict = {
                            k: v for k, v in (result.model_dump() if hasattr(result, 'model_dump')
                                              else result.__dict__).items()
                            if not k.startswith('_')
                        }
                        result_dict.pop('ssl_certificate', None)
                        result_dict.pop('downloaded_files', None)
                        results_dict.append(result_dict)
                    except Exception as e:
                        print(f"Error serializing result: {e}")
                        continue

            return CrawlResponse(success=True, results=results_dict)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
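

# Example client (illustrative sketch, not used by the server): one way to consume the
# NDJSON stream from /crawl with httpx. The request body shape depends on the
# CrawlRequest model in .models, so the payload fields below are assumptions.
#
#   import httpx  # json and asyncio are already imported above
#
#   async def consume_stream():
#       payload = {"urls": ["https://example.com"]}  # plus whatever config fields CrawlRequest accepts
#       async with httpx.AsyncClient(timeout=None) as client:
#           async with client.stream("POST", "http://localhost:8000/crawl", json=payload) as resp:
#               async for line in resp.aiter_lines():
#                   if line:
#                       print(json.loads(line).get("url"))
#
#   asyncio.run(consume_stream())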


@app.get("/schema")
async def get_schema():
    """Return config schemas for client validation"""
    return {
        "browser": BrowserConfig.model_json_schema(),
        "crawler": CrawlerRunConfig.model_json_schema()
    }
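

# Quick check (illustrative), once the server is running on port 8000:
#   curl http://localhost:8000/schema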


if __name__ == "__main__":
    import uvicorn
    # Run in auto-reload mode.
    # WARNING: You must pass the application as an import string to enable 'reload' or 'workers'.
    uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=True)
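
# Equivalent CLI invocation (assuming this module is importable as `server`):
#   uvicorn server:app --host 0.0.0.0 --port 8000 --reload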