import os
import sys
import time
from typing import List, Optional, Dict

from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends
from fastapi.responses import StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from pydantic import BaseModel, Field
from slowapi import Limiter
from slowapi.util import get_remote_address
from prometheus_fastapi_instrumentator import Instrumentator
from redis import asyncio as aioredis

sys.path.append(os.path.dirname(os.path.realpath(__file__)))

from utils import FilterType, load_config, setup_logging, verify_email_domain
from api import (
    handle_markdown_request,
    handle_llm_qa,
    handle_stream_crawl_request,
    handle_crawl_request,
    stream_results
)
from auth import create_access_token, get_token_dependency, TokenRequest  # Import from auth.py

__version__ = "0.2.6"


class CrawlRequest(BaseModel):
    urls: List[str] = Field(min_length=1, max_length=100)
    browser_config: Optional[Dict] = Field(default_factory=dict)
    crawler_config: Optional[Dict] = Field(default_factory=dict)
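
# Illustrative request body accepted by the /crawl endpoints below (values are
# placeholders, not defaults; the dict shapes for browser_config/crawler_config
# are described by the /schema endpoint):
#
#   {
#       "urls": ["https://example.com"],
#       "browser_config": {"headless": true},
#       "crawler_config": {}
#   }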

# Load configuration and setup
config = load_config()
setup_logging(config)

# Initialize Redis
redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))

# Initialize rate limiter
limiter = Limiter(
    key_func=get_remote_address,
    default_limits=[config["rate_limiting"]["default_limit"]],
    storage_uri=config["rate_limiting"]["storage_uri"]
)

app = FastAPI(
    title=config["app"]["title"],
    version=config["app"]["version"]
)


# Configure middleware
def setup_security_middleware(app, config):
    sec_config = config.get("security", {})
    if sec_config.get("enabled", False):
        if sec_config.get("https_redirect", False):
            app.add_middleware(HTTPSRedirectMiddleware)
        if sec_config.get("trusted_hosts", []) != ["*"]:
            app.add_middleware(TrustedHostMiddleware, allowed_hosts=sec_config["trusted_hosts"])


setup_security_middleware(app, config)
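
# The security settings read above and in add_security_headers below are
# expected to look roughly like this in the config file (an illustrative
# sketch, not the shipped defaults):
#
#   security:
#     enabled: true
#     https_redirect: false
#     trusted_hosts: ["*"]
#     headers:
#       x-frame-options: "DENY"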

# Prometheus instrumentation
if config["observability"]["prometheus"]["enabled"]:
    Instrumentator().instrument(app).expose(app)

# Get token dependency based on config
token_dependency = get_token_dependency(config)


# Middleware for security headers
@app.middleware("http")
async def add_security_headers(request: Request, call_next):
    response = await call_next(request)
    if config["security"]["enabled"]:
        response.headers.update(config["security"]["headers"])
    return response


# Token endpoint (always available, but usage depends on config)
@app.post("/token")
async def get_token(request_data: TokenRequest):
    if not verify_email_domain(request_data.email):
        raise HTTPException(status_code=400, detail="Invalid email domain")
    token = create_access_token({"sub": request_data.email})
    return {"email": request_data.email, "access_token": token, "token_type": "bearer"}

# Endpoints with conditional auth
@app.get("/md/{url:path}")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def get_markdown(
    request: Request,
    url: str,
    f: FilterType = FilterType.FIT,
    q: Optional[str] = None,
    c: Optional[str] = "0",
    token_data: Optional[Dict] = Depends(token_dependency)
):
    result = await handle_markdown_request(url, f, q, c, config)
    return PlainTextResponse(result)
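
# Illustrative call (assumes a local deployment; "f" selects a FilterType
# defined in utils, "q" is an optional query, "c" is the cache flag; pass an
# Authorization: Bearer header when JWT auth is enabled in the config):
#
#   httpx.get("http://localhost:8000/md/example.com", params={"q": "pricing", "c": "0"})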

@app.get("/llm/{url:path}", description="URL should be without http/https prefix")
async def llm_endpoint(
    request: Request,
    url: str = Path(...),
    q: Optional[str] = Query(None),
    token_data: Optional[Dict] = Depends(token_dependency)
):
    if not q:
        raise HTTPException(status_code=400, detail="Query parameter 'q' is required")
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    try:
        answer = await handle_llm_qa(url, q, config)
        return JSONResponse({"answer": answer})
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
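
# Illustrative call (the path segment is the target URL without its scheme and
# "q" is required; host/port are deployment-dependent):
#
#   httpx.get("http://localhost:8000/llm/example.com",
#             params={"q": "What is this page about?"})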

@app.get("/schema")
async def get_schema():
    from crawl4ai import BrowserConfig, CrawlerRunConfig
    return {"browser": BrowserConfig().dump(), "crawler": CrawlerRunConfig().dump()}


@app.get(config["observability"]["health_check"]["endpoint"])
async def health():
    return {"status": "ok", "timestamp": time.time(), "version": __version__}


@app.get(config["observability"]["prometheus"]["endpoint"])
async def metrics():
    return RedirectResponse(url=config["observability"]["prometheus"]["endpoint"])

@app.post("/crawl")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl(
    request: Request,
    crawl_request: CrawlRequest,
    token_data: Optional[Dict] = Depends(token_dependency)
):
    if not crawl_request.urls:
        raise HTTPException(status_code=400, detail="At least one URL required")

    results = await handle_crawl_request(
        urls=crawl_request.urls,
        browser_config=crawl_request.browser_config,
        crawler_config=crawl_request.crawler_config,
        config=config
    )

    return JSONResponse(results)
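
# Illustrative call (payload shape follows CrawlRequest above; assumes a local
# deployment and, when auth is enabled, a Bearer token from /token):
#
#   httpx.post("http://localhost:8000/crawl",
#              json={"urls": ["https://example.com"],
#                    "browser_config": {}, "crawler_config": {}})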

@app.post("/crawl/stream")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl_stream(
    request: Request,
    crawl_request: CrawlRequest,
    token_data: Optional[Dict] = Depends(token_dependency)
):
    if not crawl_request.urls:
        raise HTTPException(status_code=400, detail="At least one URL required")

    crawler, results_gen = await handle_stream_crawl_request(
        urls=crawl_request.urls,
        browser_config=crawl_request.browser_config,
        crawler_config=crawl_request.crawler_config,
        config=config
    )

    return StreamingResponse(
        stream_results(crawler, results_gen),
        media_type='application/x-ndjson',
        headers={'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'X-Stream-Status': 'active'}
    )
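
# The streaming endpoint emits newline-delimited JSON, one result per line; an
# illustrative way to consume it (assumes a local deployment):
#
#   with httpx.stream("POST", "http://localhost:8000/crawl/stream",
#                     json={"urls": ["https://example.com"]}) as r:
#       for line in r.iter_lines():
#           print(line)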

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "server:app",
        host=config["app"]["host"],
        port=config["app"]["port"],
        reload=config["app"]["reload"],
        timeout_keep_alive=config["app"]["timeout_keep_alive"]
    )