import os
import json
import logging
from typing import Optional, AsyncGenerator
from urllib.parse import unquote
from fastapi import HTTPException, Request, status
from fastapi.background import BackgroundTasks
from fastapi.responses import JSONResponse
from redis import asyncio as aioredis
from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    LLMExtractionStrategy,
    CacheMode
)
from crawl4ai.content_filter_strategy import (
    PruningContentFilter,
    BM25ContentFilter,
    LLMContentFilter
)
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from utils import (
    TaskStatus,
    FilterType,
    get_base_url,
    is_task_id,
    should_cleanup_task,
    decode_redis_hash
)
logger = logging.getLogger(__name__)


async def process_llm_extraction(
    redis: aioredis.Redis,
    config: dict,
    task_id: str,
    url: str,
    instruction: str,
    schema: Optional[str] = None,
    cache: str = "0"
) -> None:
    """Process LLM extraction in background."""
    try:
        llm_strategy = LLMExtractionStrategy(
            provider=config["llm"]["provider"],
            api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
            instruction=instruction,
            schema=json.loads(schema) if schema else None,
        )
        cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.BYPASS

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=url,
                config=CrawlerRunConfig(
                    extraction_strategy=llm_strategy,
                    scraping_strategy=LXMLWebScrapingStrategy(),
                    cache_mode=cache_mode
                )
            )

        if not result.success:
            await redis.hset(f"task:{task_id}", mapping={
                "status": TaskStatus.FAILED,
                "error": result.error_message
            })
            return

        content = json.loads(result.extracted_content)
        await redis.hset(f"task:{task_id}", mapping={
            "status": TaskStatus.COMPLETED,
            "result": json.dumps(content)
        })
    except Exception as e:
        logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
        await redis.hset(f"task:{task_id}", mapping={
            "status": TaskStatus.FAILED,
            "error": str(e)
        })


async def handle_markdown_request(
    url: str,
    filter_type: FilterType,
    query: Optional[str] = None,
    cache: str = "0",
    config: Optional[dict] = None
) -> str:
    """Handle markdown generation requests."""
    try:
        decoded_url = unquote(url)
        if not decoded_url.startswith(('http://', 'https://')):
            decoded_url = 'https://' + decoded_url

        if filter_type == FilterType.RAW:
            md_generator = DefaultMarkdownGenerator()
        else:
            content_filter = {
                FilterType.FIT: PruningContentFilter(),
                FilterType.BM25: BM25ContentFilter(user_query=query or ""),
                FilterType.LLM: LLMContentFilter(
                    provider=config["llm"]["provider"],
                    api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
                    instruction=query or "Extract main content"
                )
            }[filter_type]
            md_generator = DefaultMarkdownGenerator(content_filter=content_filter)

        cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.BYPASS

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=decoded_url,
                config=CrawlerRunConfig(
                    markdown_generator=md_generator,
                    scraping_strategy=LXMLWebScrapingStrategy(),
                    cache_mode=cache_mode
                )
            )

        if not result.success:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=result.error_message
            )

        return (result.markdown_v2.raw_markdown
                if filter_type == FilterType.RAW
                else result.markdown_v2.fit_markdown)
    except Exception as e:
        logger.error(f"Markdown error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )


async def handle_llm_request(
    redis: aioredis.Redis,
    background_tasks: BackgroundTasks,
    request: Request,
    input_path: str,
    query: Optional[str] = None,
    schema: Optional[str] = None,
    cache: str = "0",
    config: Optional[dict] = None
) -> JSONResponse:
    """Handle LLM extraction requests."""
    base_url = get_base_url(request)
    try:
        if is_task_id(input_path):
            return await handle_task_status(
                redis, input_path, base_url
            )

        if not query:
            return JSONResponse({
                "message": "Please provide an instruction",
                "_links": {
                    "example": {
                        "href": f"{base_url}/llm/{input_path}?q=Extract+main+content",
                        "title": "Try this example"
                    }
                }
            })

        return await create_new_task(
            redis,
            background_tasks,
            input_path,
            query,
            schema,
            cache,
            base_url,
            config
        )
    except Exception as e:
        logger.error(f"LLM endpoint error: {str(e)}", exc_info=True)
        return JSONResponse({
            "error": str(e),
            "_links": {
                "retry": {"href": str(request.url)}
            }
        }, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)


async def handle_task_status(
    redis: aioredis.Redis,
    task_id: str,
    base_url: str
) -> JSONResponse:
    """Handle task status check requests."""
    task = await redis.hgetall(f"task:{task_id}")
    if not task:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Task not found"
        )

    task = decode_redis_hash(task)
    response = create_task_response(task, task_id, base_url)

    if task["status"] in [TaskStatus.COMPLETED, TaskStatus.FAILED]:
        if should_cleanup_task(task["created_at"]):
            await redis.delete(f"task:{task_id}")

    return JSONResponse(response)


async def create_new_task(
    redis: aioredis.Redis,
    background_tasks: BackgroundTasks,
    input_path: str,
    query: str,
    schema: Optional[str],
    cache: str,
    base_url: str,
    config: dict
) -> JSONResponse:
    """Create and initialize a new task."""
    decoded_url = unquote(input_path)
    if not decoded_url.startswith(('http://', 'https://')):
        decoded_url = 'https://' + decoded_url

    from datetime import datetime
    task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"

    await redis.hset(f"task:{task_id}", mapping={
        "status": TaskStatus.PROCESSING,
        "created_at": datetime.now().isoformat(),
        "url": decoded_url
    })

    background_tasks.add_task(
        process_llm_extraction,
        redis,
        config,
        task_id,
        decoded_url,
        query,
        schema,
        cache
    )

    return JSONResponse({
        "task_id": task_id,
        "status": TaskStatus.PROCESSING,
        "url": decoded_url,
        "_links": {
            "self": {"href": f"{base_url}/llm/{task_id}"},
            "status": {"href": f"{base_url}/llm/{task_id}"}
        }
    })


def create_task_response(task: dict, task_id: str, base_url: str) -> dict:
    """Create response for task status check."""
    response = {
        "task_id": task_id,
        "status": task["status"],
        "created_at": task["created_at"],
        "url": task["url"],
        "_links": {
            "self": {"href": f"{base_url}/llm/{task_id}"},
            "refresh": {"href": f"{base_url}/llm/{task_id}"}
        }
    }

    if task["status"] == TaskStatus.COMPLETED:
        response["result"] = json.loads(task["result"])
    elif task["status"] == TaskStatus.FAILED:
        response["error"] = task["error"]

    return response


async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
    """Stream crawl results as newline-delimited JSON, ending with a completion marker."""
    import asyncio
    from utils import datetime_handler

    try:
        async for result in results_gen:
            try:
                result_dict = result.model_dump()
                logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
                data = json.dumps(result_dict, default=datetime_handler) + "\n"
                yield data.encode('utf-8')
            except Exception as e:
                logger.error(f"Serialization error: {e}")
                error_response = {"error": str(e), "url": getattr(result, 'url', 'unknown')}
                yield (json.dumps(error_response) + "\n").encode('utf-8')

        yield json.dumps({"status": "completed"}).encode('utf-8')
    except asyncio.CancelledError:
        logger.warning("Client disconnected during streaming")
    finally:
        try:
            await crawler.close()
        except Exception as e:
            logger.error(f"Crawler cleanup error: {e}")