# ───────────────────────── server.py ─────────────────────────
"""
Crawl4AI FastAPI entry‑point
• Browser pool + global page cap
• Rate‑limiting, security, metrics
• /crawl, /crawl/stream, /md, /llm endpoints
"""

# ── stdlib & 3rd‑party imports ───────────────────────────────
import os, sys, time, asyncio
from typing import List, Optional, Dict
from contextlib import asynccontextmanager
import pathlib

from fastapi import (
    FastAPI, HTTPException, Request, Path, Query, Depends
)
from fastapi.responses import (
    StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
)
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.staticfiles import StaticFiles

import ast, crawl4ai as _c4
from pydantic import BaseModel, Field
from slowapi import Limiter
from slowapi.util import get_remote_address
from prometheus_fastapi_instrumentator import Instrumentator
from redis import asyncio as aioredis

# ── internal imports (after sys.path append) ─────────────────
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
from utils import (
    FilterType, load_config, setup_logging, verify_email_domain
)
from api import (
    handle_markdown_request, handle_llm_qa,
    handle_stream_crawl_request, handle_crawl_request,
    stream_results
)
from auth import create_access_token, get_token_dependency, TokenRequest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawler_pool import get_crawler, close_all, janitor

# ────────────────── configuration / logging ──────────────────
config = load_config()
setup_logging(config)

__version__ = "0.5.1-d1"

# ── global page semaphore (hard cap) ─────────────────────────
MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30)
GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)

# import logging
# page_log = logging.getLogger("page_cap")
# orig_arun = AsyncWebCrawler.arun
# async def capped_arun(self, *a, **kw):
#     await GLOBAL_SEM.acquire()                      # ← take slot
#     try:
#         in_flight = MAX_PAGES - GLOBAL_SEM._value   # used permits
#         page_log.info("🕸️ pages_in_flight=%s / %s", in_flight, MAX_PAGES)
#         return await orig_arun(self, *a, **kw)
#     finally:
#         GLOBAL_SEM.release()                        # ← free slot

orig_arun = AsyncWebCrawler.arun
async def capped_arun(self, *a, **kw):
    async with GLOBAL_SEM:
        return await orig_arun(self, *a, **kw)
AsyncWebCrawler.arun = capped_arun
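# The patch above routes every AsyncWebCrawler.arun() call through GLOBAL_SEM,
# so at most MAX_PAGES pages are in flight across all requests. A minimal
# sketch of the matching config.yml section — key names inferred from the
# lookups in this module, values illustrative rather than project defaults:
#
#   crawler:
#     pool:
#       max_pages: 30        # hard cap enforced by GLOBAL_SEM
#     browser:
#       extra_args: []       # forwarded to BrowserConfig(extra_args=...)
#       kwargs: {}           # extra BrowserConfig keyword arguments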

# ───────────────────── FastAPI lifespan ──────────────────────
@asynccontextmanager
async def lifespan(_: FastAPI):
    await get_crawler(BrowserConfig(
        extra_args=config["crawler"]["browser"].get("extra_args", []),
        **config["crawler"]["browser"].get("kwargs", {}),
    ))                                                    # warm‑up
    app.state.janitor = asyncio.create_task(janitor())    # idle GC
    yield
    app.state.janitor.cancel()
    await close_all()

# ───────────────────── FastAPI instance ──────────────────────
app = FastAPI(
    title=config["app"]["title"],
    version=config["app"]["version"],
    lifespan=lifespan,
)

# ── static playground ──────────────────────────────────────
STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground"
if not STATIC_DIR.exists():
    raise RuntimeError(f"Playground assets not found at {STATIC_DIR}")
app.mount(
    "/playground",
    StaticFiles(directory=STATIC_DIR, html=True),
    name="play",
)

# Optional nice‑to‑have: opening the root shows the playground
@app.get("/")
async def root():
    return RedirectResponse("/playground")

# ─────────────────── infra / middleware ─────────────────────
redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))

limiter = Limiter(
    key_func=get_remote_address,
    default_limits=[config["rate_limiting"]["default_limit"]],
    storage_uri=config["rate_limiting"]["storage_uri"],
)

def _setup_security(app_: FastAPI):
    sec = config["security"]
    if not sec["enabled"]:
        return
    if sec.get("https_redirect"):
        app_.add_middleware(HTTPSRedirectMiddleware)
    if sec.get("trusted_hosts", []) != ["*"]:
        app_.add_middleware(
            TrustedHostMiddleware, allowed_hosts=sec["trusted_hosts"]
        )
_setup_security(app)
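# A minimal sketch of the config sections consumed by the infra/security code
# above and the headers middleware below — key names inferred from the lookups
# in this file; values are illustrative, not project defaults:
#
#   redis:
#     uri: "redis://localhost:6379"
#   rate_limiting:
#     default_limit: "100/minute"
#     storage_uri: "memory://"
#   security:
#     enabled: false
#     https_redirect: false
#     trusted_hosts: ["*"]
#     headers:
#       X-Content-Type-Options: "nosniff"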

if config["observability"]["prometheus"]["enabled"]:
    Instrumentator().instrument(app).expose(app)

token_dep = get_token_dependency(config)

@app.middleware("http")
async def add_security_headers(request: Request, call_next):
    resp = await call_next(request)
    if config["security"]["enabled"]:
        resp.headers.update(config["security"]["headers"])
    return resp

# ───────────────── safe config‑dump helper ─────────────────
ALLOWED_TYPES = {
    "CrawlerRunConfig": CrawlerRunConfig,
    "BrowserConfig": BrowserConfig,
}

def _safe_eval_config(expr: str) -> dict:
    """
    Accept exactly one top‑level call to CrawlerRunConfig(...) or BrowserConfig(...).
    Whatever is inside the parentheses is fine *except* further function calls
    (so no __import__('os') stuff). All public names from crawl4ai are available
    when we eval.
    """
    tree = ast.parse(expr, mode="eval")

    # must be a single call
    if not isinstance(tree.body, ast.Call):
        raise ValueError("Expression must be a single constructor call")

    call = tree.body
    if not (isinstance(call.func, ast.Name) and call.func.id in {"CrawlerRunConfig", "BrowserConfig"}):
        raise ValueError("Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")

    # forbid nested calls to keep the surface tiny
    for node in ast.walk(call):
        if isinstance(node, ast.Call) and node is not call:
            raise ValueError("Nested function calls are not permitted")

    # expose everything that crawl4ai exports, nothing else
    safe_env = {name: getattr(_c4, name) for name in dir(_c4) if not name.startswith("_")}
    obj = eval(compile(tree, "<config>", "eval"), {"__builtins__": {}}, safe_env)
    return obj.dump()
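# Illustrative inputs and outcomes for _safe_eval_config (the keyword arguments
# shown are assumed to be valid crawl4ai constructor parameters):
#
#   _safe_eval_config("CrawlerRunConfig(stream=True)")     # -> config dict
#   _safe_eval_config("BrowserConfig(headless=False)")     # -> config dict
#   _safe_eval_config("__import__('os')")                  # ValueError: wrong constructor
#   _safe_eval_config("BrowserConfig(extra_args=list())")  # ValueError: nested call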

# ───────────────────────── Schemas ───────────────────────────
class CrawlRequest(BaseModel):
    urls: List[str] = Field(min_length=1, max_length=100)
    browser_config: Optional[Dict] = Field(default_factory=dict)
    crawler_config: Optional[Dict] = Field(default_factory=dict)

class RawCode(BaseModel):
    code: str

# ──────────────────────── Endpoints ──────────────────────────
@app.post("/token")
async def get_token(req: TokenRequest):
    if not verify_email_domain(req.email):
        raise HTTPException(400, "Invalid email domain")
    token = create_access_token({"sub": req.email})
    return {"email": req.email, "access_token": token, "token_type": "bearer"}

@app.post("/config/dump")
async def config_dump(raw: RawCode):
    try:
        return JSONResponse(_safe_eval_config(raw.code.strip()))
    except Exception as e:
        raise HTTPException(400, str(e))
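# Example config dump (host/port illustrative):
#
#   curl -X POST http://localhost:11235/config/dump \
#        -H "Content-Type: application/json" \
#        -d '{"code": "CrawlerRunConfig(stream=True)"}'
#
# Returns the dumped config as JSON, or 400 if the expression is rejected.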

@app.get("/md/{url:path}")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def get_markdown(
    request: Request,
    url: str,
    f: FilterType = FilterType.FIT,
    q: Optional[str] = None,
    c: str = "0",
    _td: Dict = Depends(token_dep),
):
    md = await handle_markdown_request(url, f, q, c, config)
    return PlainTextResponse(md)
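# Example markdown request (host/port illustrative; `f` selects a FilterType —
# "fit" is assumed to be the string value of the FIT default — `q` is an
# optional query and `c` a cache flag, semantics deferred to
# handle_markdown_request):
#
#   curl "http://localhost:11235/md/https://example.com?f=fit&c=0" \
#        -H "Authorization: Bearer <token>"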

@app.get("/llm/{url:path}")
async def llm_endpoint(
    request: Request,
    url: str = Path(...),
    q: Optional[str] = Query(None),
    _td: Dict = Depends(token_dep),
):
    if not q:
        raise HTTPException(400, "Query parameter 'q' is required")
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    answer = await handle_llm_qa(url, q, config)
    return JSONResponse({"answer": answer})
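# Example LLM Q&A request (host/port illustrative; the https:// scheme is
# prepended automatically when missing):
#
#   curl "http://localhost:11235/llm/example.com?q=What+is+this+page+about" \
#        -H "Authorization: Bearer <token>"
#
# Response: {"answer": "..."}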

@app.get("/schema")
async def get_schema():
    from crawl4ai import BrowserConfig, CrawlerRunConfig
    return {"browser": BrowserConfig().dump(),
            "crawler": CrawlerRunConfig().dump()}

@app.get(config["observability"]["health_check"]["endpoint"])
async def health():
    return {"status": "ok", "timestamp": time.time(), "version": __version__}

@app.get(config["observability"]["prometheus"]["endpoint"])
async def metrics():
    return RedirectResponse(config["observability"]["prometheus"]["endpoint"])

@app.post("/crawl")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl(
    request: Request,
    crawl_request: CrawlRequest,
    _td: Dict = Depends(token_dep),
):
    if not crawl_request.urls:
        raise HTTPException(400, "At least one URL required")
    res = await handle_crawl_request(
        urls=crawl_request.urls,
        browser_config=crawl_request.browser_config,
        crawler_config=crawl_request.crawler_config,
        config=config,
    )
    return JSONResponse(res)
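# Example batch crawl (host/port illustrative; browser_config / crawler_config
# are the optional dicts defined by the CrawlRequest schema above):
#
#   curl -X POST http://localhost:11235/crawl \
#        -H "Content-Type: application/json" \
#        -H "Authorization: Bearer <token>" \
#        -d '{"urls": ["https://example.com"], "crawler_config": {}}'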

@app.post("/crawl/stream")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl_stream(
    request: Request,
    crawl_request: CrawlRequest,
    _td: Dict = Depends(token_dep),
):
    if not crawl_request.urls:
        raise HTTPException(400, "At least one URL required")
    crawler, gen = await handle_stream_crawl_request(
        urls=crawl_request.urls,
        browser_config=crawl_request.browser_config,
        crawler_config=crawl_request.crawler_config,
        config=config,
    )
    return StreamingResponse(
        stream_results(crawler, gen),
        media_type="application/x-ndjson",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Stream-Status": "active",
        },
    )
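# Example streaming crawl (host/port illustrative). `-N` turns off curl's
# output buffering so each NDJSON line is printed as it arrives:
#
#   curl -N -X POST http://localhost:11235/crawl/stream \
#        -H "Content-Type: application/json" \
#        -H "Authorization: Bearer <token>" \
#        -d '{"urls": ["https://example.com", "https://example.org"]}'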

# ────────────────────────── cli ──────────────────────────────
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "server:app",
        host=config["app"]["host"],
        port=config["app"]["port"],
        reload=config["app"]["reload"],
        timeout_keep_alive=config["app"]["timeout_keep_alive"],
    )
# ─────────────────────────────────────────────────────────────