feat(mcp): Implement MCP protocol and enhance server capabilities
This commit introduces several significant enhancements to the Crawl4AI Docker deployment:

1. Add MCP protocol support:
   - Implement WebSocket and SSE transport layers for MCP server communication
   - Create mcp_bridge.py to expose existing API endpoints via the MCP protocol
   - Add tests for both the socket and SSE transport methods

2. Enhance Docker server capabilities:
   - Add a PDF generation endpoint with file-saving support
   - Add a screenshot capture endpoint with a configurable wait time
   - Implement a JavaScript execution endpoint for dynamic page interaction
   - Add intelligent file-path handling for saving generated assets

3. Improve search and context functionality:
   - Implement syntax-aware chunking of code functions using AST parsing
   - Add BM25-based document search with relevance scoring
   - Create separate code and documentation context endpoints
   - Enhance the response format with structured results and scores

4. Rename and fix file organization:
   - Fix a typo in the test_docker_config_gen.py filename
   - Update import statements and dependencies
   - Add FileResponse for the context endpoints

These changes significantly improve Crawl4AI's machine-to-machine communication capabilities, making it better suited for integration with LLM agents and other automated systems. The CHANGELOG has been updated to highlight the key features and improvements in this release.
parent a58c8000aa
commit 5297e362f3

CHANGELOG.md (+24)
@@ -5,6 +5,30 @@ All notable changes to Crawl4AI will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

### [Feature] 2025-04-21

- Implemented MCP protocol for machine-to-machine communication
- Added WebSocket and SSE transport for MCP server
- Exposed server endpoints via MCP protocol
- Created tests for MCP socket and SSE communication
- Enhanced Docker server with file handling and intelligent search
- Added PDF and screenshot endpoints with file saving capability
- Added JavaScript execution endpoint for page interaction
- Implemented advanced context search with BM25 and code chunking
- Added file path output support for generated assets
- Improved server endpoints and API surface
- Added intelligent context search with query filtering
- Added syntax-aware code function chunking
- Implemented efficient HTML processing pipeline

### [Refactor] 2025-04-20

- Replaced crawler_manager.py with simpler crawler_pool.py implementation
- Added global page semaphore for hard concurrency cap
- Implemented browser pool with idle cleanup
- Added playground UI for testing and stress testing
- Updated API handlers to use pooled crawlers
- Enhanced logging levels and symbols
- Added memory tests and stress test utilities

### [Added] 2025-04-17

- Added content source selection feature for markdown generation
- New `content_source` parameter allows choosing between `cleaned_html`, `raw_html`, and `fit_html`
deploy/docker/c4ai-code-context.md (new file, +11631 lines; diff suppressed because it is too large)
deploy/docker/c4ai-doc-context.md (new file, +8899 lines; diff suppressed because it is too large)
deploy/docker/mcp_bridge.py (new file, +252 lines)
@@ -0,0 +1,252 @@
# deploy/docker/mcp_bridge.py

from __future__ import annotations
import inspect, json, re, anyio
from contextlib import suppress
from typing import Any, Callable, Dict, List, Tuple
import httpx

from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
from fastapi.responses import JSONResponse
from fastapi import Request
from sse_starlette.sse import EventSourceResponse
from pydantic import BaseModel
from mcp.server.sse import SseServerTransport

import mcp.types as t
from mcp.server.lowlevel.server import Server, NotificationOptions
from mcp.server.models import InitializationOptions


# ── opt‑in decorators ───────────────────────────────────────────
def mcp_resource(name: str | None = None):
    def deco(fn):
        fn.__mcp_kind__, fn.__mcp_name__ = "resource", name
        return fn
    return deco


def mcp_template(name: str | None = None):
    def deco(fn):
        fn.__mcp_kind__, fn.__mcp_name__ = "template", name
        return fn
    return deco


def mcp_tool(name: str | None = None):
    def deco(fn):
        fn.__mcp_kind__, fn.__mcp_name__ = "tool", name
        return fn
    return deco

# ── HTTP‑proxy helper for FastAPI endpoints ─────────────────────
def _make_http_proxy(base_url: str, route):
    method = list(route.methods - {"HEAD", "OPTIONS"})[0]

    async def proxy(**kwargs):
        # replace `/items/{id}` style params first
        path = route.path
        for k, v in list(kwargs.items()):
            placeholder = "{" + k + "}"
            if placeholder in path:
                path = path.replace(placeholder, str(v))
                kwargs.pop(k)
        url = base_url.rstrip("/") + path

        async with httpx.AsyncClient() as client:
            try:
                r = (
                    await client.get(url, params=kwargs)
                    if method == "GET"
                    else await client.request(method, url, json=kwargs)
                )
                r.raise_for_status()
                return r.text if method == "GET" else r.json()
            except httpx.HTTPStatusError as e:
                # surface FastAPI error details instead of plain 500
                raise HTTPException(e.response.status_code, e.response.text)

    return proxy

# ── main entry point ────────────────────────────────────────────
def attach_mcp(
    app: FastAPI,
    *,                         # keyword‑only
    base: str = "/mcp",
    name: str | None = None,
    base_url: str,             # eg. "http://127.0.0.1:8020"
) -> None:
    """Call once after all routes are declared to expose WS+SSE MCP endpoints."""
    server_name = name or app.title or "FastAPI-MCP"
    mcp = Server(server_name)

    # tools: Dict[str, Callable] = {}
    tools: Dict[str, Tuple[Callable, Callable]] = {}
    resources: Dict[str, Callable] = {}
    templates: Dict[str, Callable] = {}

    # register decorated FastAPI routes
    for route in app.routes:
        fn = getattr(route, "endpoint", None)
        kind = getattr(fn, "__mcp_kind__", None)
        if not kind:
            continue

        key = fn.__mcp_name__ or re.sub(r"[/{}}]", "_", route.path).strip("_")

        # if kind == "tool":
        #     tools[key] = _make_http_proxy(base_url, route)
        if kind == "tool":
            proxy = _make_http_proxy(base_url, route)
            tools[key] = (proxy, fn)
            continue
        if kind == "resource":
            resources[key] = fn
        if kind == "template":
            templates[key] = fn

    # helpers for JSON‑Schema
    def _schema(model: type[BaseModel] | None) -> dict:
        return {"type": "object"} if model is None else model.model_json_schema()

    def _body_model(fn: Callable) -> type[BaseModel] | None:
        for p in inspect.signature(fn).parameters.values():
            a = p.annotation
            if inspect.isclass(a) and issubclass(a, BaseModel):
                return a
        return None

    # MCP handlers
    @mcp.list_tools()
    async def _list_tools() -> List[t.Tool]:
        out = []
        for k, (proxy, orig_fn) in tools.items():
            desc = getattr(orig_fn, "__mcp_description__", None) or inspect.getdoc(orig_fn) or ""
            schema = getattr(orig_fn, "__mcp_schema__", None) or _schema(_body_model(orig_fn))
            out.append(
                t.Tool(name=k, description=desc, inputSchema=schema)
            )
        return out

    @mcp.call_tool()
    async def _call_tool(name: str, arguments: Dict | None) -> List[t.TextContent]:
        if name not in tools:
            raise HTTPException(404, "tool not found")

        proxy, _ = tools[name]
        try:
            res = await proxy(**(arguments or {}))
        except HTTPException as exc:
            # map server‑side errors into MCP "text/error" payloads
            err = {"error": exc.status_code, "detail": exc.detail}
            return [t.TextContent(type="text", text=json.dumps(err))]
        return [t.TextContent(type="text", text=json.dumps(res, default=str))]

    @mcp.list_resources()
    async def _list_resources() -> List[t.Resource]:
        return [
            t.Resource(name=k, description=inspect.getdoc(f) or "", mime_type="application/json")
            for k, f in resources.items()
        ]

    @mcp.read_resource()
    async def _read_resource(name: str) -> List[t.TextContent]:
        if name not in resources:
            raise HTTPException(404, "resource not found")
        res = resources[name]()
        return [t.TextContent(type="text", text=json.dumps(res, default=str))]

    @mcp.list_resource_templates()
    async def _list_templates() -> List[t.ResourceTemplate]:
        return [
            t.ResourceTemplate(
                name=k,
                description=inspect.getdoc(f) or "",
                parameters={
                    p: {"type": "string"} for p in _path_params(app, f)
                },
            )
            for k, f in templates.items()
        ]

    init_opts = InitializationOptions(
        server_name=server_name,
        server_version="0.1.0",
        capabilities=mcp.get_capabilities(
            notification_options=NotificationOptions(),
            experimental_capabilities={},
        ),
    )

    # ── WebSocket transport ────────────────────────────────────
    @app.websocket_route(f"{base}/ws")
    async def _ws(ws: WebSocket):
        await ws.accept()
        c2s_send, c2s_recv = anyio.create_memory_object_stream(100)
        s2c_send, s2c_recv = anyio.create_memory_object_stream(100)

        from pydantic import TypeAdapter
        from mcp.types import JSONRPCMessage
        adapter = TypeAdapter(JSONRPCMessage)

        init_done = anyio.Event()

        async def srv_to_ws():
            first = True
            try:
                async for msg in s2c_recv:
                    await ws.send_json(msg.model_dump())
                    if first:
                        init_done.set()
                        first = False
            finally:
                # make sure cleanup survives TaskGroup cancellation
                with anyio.CancelScope(shield=True):
                    with suppress(RuntimeError):      # idempotent close
                        await ws.close()

        async def ws_to_srv():
            try:
                # 1st frame is always "initialize"
                first = adapter.validate_python(await ws.receive_json())
                await c2s_send.send(first)
                await init_done.wait()                # block until server ready
                while True:
                    data = await ws.receive_json()
                    await c2s_send.send(adapter.validate_python(data))
            except WebSocketDisconnect:
                await c2s_send.aclose()

        async with anyio.create_task_group() as tg:
            tg.start_soon(mcp.run, c2s_recv, s2c_send, init_opts)
            tg.start_soon(ws_to_srv)
            tg.start_soon(srv_to_ws)

    # ── SSE transport (official) ─────────────────────────────
    sse = SseServerTransport(f"{base}/messages/")

    @app.get(f"{base}/sse")
    async def _mcp_sse(request: Request):
        async with sse.connect_sse(
            request.scope, request.receive, request._send   # starlette ASGI primitives
        ) as (read_stream, write_stream):
            await mcp.run(read_stream, write_stream, init_opts)

    # client → server frames are POSTed here
    app.mount(f"{base}/messages", app=sse.handle_post_message)

    # ── schema endpoint ───────────────────────────────────────
    @app.get(f"{base}/schema")
    async def _schema_endpoint():
        return JSONResponse({
            "tools": [x.model_dump() for x in await _list_tools()],
            "resources": [x.model_dump() for x in await _list_resources()],
            "resource_templates": [x.model_dump() for x in await _list_templates()],
        })


# ── helpers ────────────────────────────────────────────────────
def _route_name(path: str) -> str:
    return re.sub(r"[/{}}]", "_", path).strip("_")


def _path_params(app: FastAPI, fn: Callable) -> List[str]:
    for r in app.routes:
        if r.endpoint is fn:
            return list(r.param_convertors.keys())
    return []
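The wiring pattern the bridge expects is visible in the FastAPI server changes further down in this diff: decorate a normal route handler with `@mcp_tool(...)`, then call `attach_mcp(...)` once after all routes are declared. The sketch below is a minimal, self-contained illustration of that pattern; the `/echo` route and the port are assumptions made for the example, not part of the commit.

```python
# Minimal illustration of how mcp_bridge is intended to be used
# (hypothetical /echo route; host/port chosen for the example only).
from fastapi import FastAPI
from pydantic import BaseModel
from mcp_bridge import attach_mcp, mcp_tool

app = FastAPI(title="Demo server")

class EchoRequest(BaseModel):
    message: str

@app.post("/echo")
@mcp_tool("echo")                 # marks the route for the MCP bridge
async def echo(body: EchoRequest):
    """Echo the message back; the docstring becomes the MCP tool description."""
    return {"echo": body.message}

# Exposes /mcp/ws, /mcp/sse and /mcp/schema; base_url must point at this server.
attach_mcp(app, base_url="http://127.0.0.1:8020")
```

The tool's input schema is derived from the Pydantic body model, and calls arriving over MCP are proxied back to the HTTP route via `_make_http_proxy`.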
@@ -1,9 +1,15 @@
fastapi
uvicorn
fastapi==0.115.12
uvicorn==0.34.2
gunicorn>=23.0.0
slowapi>=0.1.9
prometheus-fastapi-instrumentator>=7.0.2
slowapi==0.1.9
prometheus-fastapi-instrumentator>=7.1.0
redis>=5.2.1
jwt>=1.3.1
dnspython>=2.7.0
email-validator>=2.2.0
email-validator==2.2.0
sse-starlette==2.2.1
pydantic==2.11
rank-bm25==0.2.2
anyio==4.9.0
PyJWT==2.10.1
@@ -7,14 +7,47 @@ Crawl4AI FastAPI entry‑point
"""

# ── stdlib & 3rd‑party imports ───────────────────────────────
import os, sys, time, asyncio
from typing import List, Optional, Dict
from crawler_pool import get_crawler, close_all, janitor
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from auth import create_access_token, get_token_dependency, TokenRequest
from pydantic import BaseModel
from typing import Optional, List, Dict
from fastapi import Request, Depends
from fastapi.responses import FileResponse
import base64
import re
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from api import (
    handle_markdown_request, handle_llm_qa,
    handle_stream_crawl_request, handle_crawl_request,
    stream_results
)
from utils import (
    FilterType, load_config, setup_logging, verify_email_domain
)
import os
import sys
import time
import asyncio
from typing import List
from contextlib import asynccontextmanager
import pathlib

from fastapi import (
    FastAPI, HTTPException, Request, Path, Query, Depends
)
from rank_bm25 import BM25Okapi

def chunk_code_functions(code: str) -> List[str]:
    tree = ast.parse(code)
    lines = code.splitlines()
    chunks = []
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            start = node.lineno - 1
            end = getattr(node, 'end_lineno', start + 1)
            chunks.append("\n".join(lines[start:end]))
    return chunks

from fastapi.responses import (
    StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
)
@@ -22,7 +55,10 @@ from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.staticfiles import StaticFiles

import ast, crawl4ai as _c4
from mcp_bridge import attach_mcp, mcp_resource, mcp_template, mcp_tool

import ast
import crawl4ai as _c4
from pydantic import BaseModel, Field
from slowapi import Limiter
from slowapi.util import get_remote_address
@@ -31,17 +67,6 @@ from redis import asyncio as aioredis

# ── internal imports (after sys.path append) ─────────────────
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
from utils import (
    FilterType, load_config, setup_logging, verify_email_domain
)
from api import (
    handle_markdown_request, handle_llm_qa,
    handle_stream_crawl_request, handle_crawl_request,
    stream_results
)
from auth import create_access_token, get_token_dependency, TokenRequest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawler_pool import get_crawler, close_all, janitor

# ────────────────── configuration / logging ──────────────────
config = load_config()
@@ -66,12 +91,16 @@ GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
#   GLOBAL_SEM.release()          # ← free slot

orig_arun = AsyncWebCrawler.arun


async def capped_arun(self, *a, **kw):
    async with GLOBAL_SEM:
        return await orig_arun(self, *a, **kw)
AsyncWebCrawler.arun = capped_arun

# ───────────────────── FastAPI lifespan ──────────────────────


@asynccontextmanager
async def lifespan(_: FastAPI):
    await get_crawler(BrowserConfig(
@@ -101,6 +130,8 @@ app.mount(
)

# Optional nice‑to‑have: opening the root shows the playground


@app.get("/")
async def root():
    return RedirectResponse("/playground")
@@ -114,6 +145,7 @@ limiter = Limiter(
    storage_uri=config["rate_limiting"]["storage_uri"],
)


def _setup_security(app_: FastAPI):
    sec = config["security"]
    if not sec["enabled"]:
@@ -124,6 +156,8 @@ def _setup_security(app_: FastAPI):
    app_.add_middleware(
        TrustedHostMiddleware, allowed_hosts=sec["trusted_hosts"]
    )


_setup_security(app)

if config["observability"]["prometheus"]["enabled"]:
@@ -131,6 +165,7 @@ if config["observability"]["prometheus"]["enabled"]:

token_dep = get_token_dependency(config)


@app.middleware("http")
async def add_security_headers(request: Request, call_next):
    resp = await call_next(request)
@@ -144,6 +179,7 @@ ALLOWED_TYPES = {
    "BrowserConfig": BrowserConfig,
}


def _safe_eval_config(expr: str) -> dict:
    """
    Accept exactly one top‑level call to CrawlerRunConfig(...) or BrowserConfig(...).
@@ -159,7 +195,8 @@ def _safe_eval_config(expr: str) -> dict:

    call = tree.body
    if not (isinstance(call.func, ast.Name) and call.func.id in {"CrawlerRunConfig", "BrowserConfig"}):
        raise ValueError("Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")
        raise ValueError(
            "Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")

    # forbid nested calls to keep the surface tiny
    for node in ast.walk(call):
@@ -167,8 +204,10 @@ def _safe_eval_config(expr: str) -> dict:
            raise ValueError("Nested function calls are not permitted")

    # expose everything that crawl4ai exports, nothing else
    safe_env = {name: getattr(_c4, name) for name in dir(_c4) if not name.startswith("_")}
    obj = eval(compile(tree, "<config>", "eval"), {"__builtins__": {}}, safe_env)
    safe_env = {name: getattr(_c4, name)
                for name in dir(_c4) if not name.startswith("_")}
    obj = eval(compile(tree, "<config>", "eval"),
               {"__builtins__": {}}, safe_env)
    return obj.dump()


@@ -178,10 +217,42 @@ class CrawlRequest(BaseModel):
    browser_config: Optional[Dict] = Field(default_factory=dict)
    crawler_config: Optional[Dict] = Field(default_factory=dict)

# ────────────── Schemas ──────────────
class MarkdownRequest(BaseModel):
    """Request body for the /md endpoint."""
    url: str = Field(..., description="Absolute http/https URL to fetch")
    f: FilterType = Field(FilterType.FIT,
                          description="Content‑filter strategy: FIT, RAW, BM25, or LLM")
    q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
    c: Optional[str] = Field("0", description="Cache‑bust / revision counter")


class RawCode(BaseModel):
    code: str

class HTMLRequest(BaseModel):
    url: str

class ScreenshotRequest(BaseModel):
    url: str
    screenshot_wait_for: Optional[float] = 2
    output_path: Optional[str] = None

class PDFRequest(BaseModel):
    url: str
    output_path: Optional[str] = None


class JSEndpointRequest(BaseModel):
    url: str
    scripts: List[str] = Field(
        ...,
        description="List of separated JavaScript snippets to execute"
    )

# ──────────────────────── Endpoints ──────────────────────────


@app.post("/token")
async def get_token(req: TokenRequest):
    if not verify_email_domain(req.email):
@@ -189,6 +260,7 @@ async def get_token(req: TokenRequest):
    token = create_access_token({"sub": req.email})
    return {"email": req.email, "access_token": token, "token_type": "bearer"}


@app.post("/config/dump")
|
||||
async def config_dump(raw: RawCode):
|
||||
try:
|
||||
@ -197,18 +269,164 @@ async def config_dump(raw: RawCode):
|
||||
raise HTTPException(400, str(e))
|
||||
|
||||
|
||||
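For reference, `/config/dump` takes a `RawCode` body whose `code` field must be a single top-level `CrawlerRunConfig(...)` or `BrowserConfig(...)` expression; `_safe_eval_config` validates it before calling `dump()`. A hedged sketch of a client call (the host and port are assumptions for a local deployment, matching the test near the end of this diff):

```python
# Sketch: dump a CrawlerRunConfig through the /config/dump endpoint.
# Assumes the server is listening on localhost:11235.
import requests

resp = requests.post(
    "http://localhost:11235/config/dump",
    json={"code": "CrawlerRunConfig(stream=True, screenshot=False)"},
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # the serialized config dict produced by obj.dump()
```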
@app.get("/md/{url:path}")
|
||||
@app.post("/md")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
@mcp_tool("md")
|
||||
async def get_markdown(
|
||||
request: Request,
|
||||
url: str,
|
||||
f: FilterType = FilterType.FIT,
|
||||
q: Optional[str] = None,
|
||||
c: str = "0",
|
||||
body: MarkdownRequest,
|
||||
_td: Dict = Depends(token_dep),
|
||||
):
|
||||
md = await handle_markdown_request(url, f, q, c, config)
|
||||
return PlainTextResponse(md)
|
||||
if not body.url.startswith(("http://", "https://")):
|
||||
raise HTTPException(400, "URL must be absolute and start with http/https")
|
||||
markdown = await handle_markdown_request(
|
||||
body.url, body.f, body.q, body.c, config
|
||||
)
|
||||
return JSONResponse({
|
||||
"url": body.url,
|
||||
"filter": body.f,
|
||||
"query": body.q,
|
||||
"cache": body.c,
|
||||
"markdown": markdown,
|
||||
"success": True
|
||||
})
|
||||
|
||||
|
||||
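The reworked `/md` endpoint now accepts a JSON `MarkdownRequest` body and returns a structured JSON payload instead of plain text. A hedged example call (local host/port are assumptions; add an Authorization header if JWT security is enabled):

```python
# Sketch: request fit-markdown for a page via the new POST /md endpoint.
import requests

payload = {
    "url": "https://example.com",
    "f": "fit",      # FIT, RAW, BM25, or LLM filter strategy
    "q": None,       # query string, used by the BM25/LLM filters
    "c": "0",        # cache-bust / revision counter
}
r = requests.post("http://localhost:11235/md", json=payload, timeout=60)
r.raise_for_status()
data = r.json()
print(data["success"], len(data["markdown"]))
```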
@app.post("/html")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
@mcp_tool("html")
|
||||
async def generate_html(
|
||||
request: Request,
|
||||
body: HTMLRequest,
|
||||
_td: Dict = Depends(token_dep),
|
||||
):
|
||||
"""
|
||||
Crawls the URL, preprocesses the raw HTML for schema extraction, and returns the processed HTML.
|
||||
Use when you need sanitized HTML structures for building schemas or further processing.
|
||||
"""
|
||||
cfg = CrawlerRunConfig()
|
||||
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
||||
results = await crawler.arun(url=body.url, config=cfg)
|
||||
raw_html = results[0].html
|
||||
from crawl4ai.utils import preprocess_html_for_schema
|
||||
processed_html = preprocess_html_for_schema(raw_html)
|
||||
return JSONResponse({"html": processed_html, "url": body.url, "success": True})
|
||||
|
||||
# Screenshot endpoint


@app.post("/screenshot")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("screenshot")
async def generate_screenshot(
    request: Request,
    body: ScreenshotRequest,
    _td: Dict = Depends(token_dep),
):
    """
    Capture a full-page PNG screenshot of the specified URL, waiting an optional delay before capture.
    Use when you need an image snapshot of the rendered page. It is recommended to provide an output path to save the screenshot;
    the result will then contain the path to the saved file instead of the base64 screenshot.
    """
    cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
        results = await crawler.arun(url=body.url, config=cfg)
    screenshot_data = results[0].screenshot
    if body.output_path:
        abs_path = os.path.abspath(body.output_path)
        os.makedirs(os.path.dirname(abs_path), exist_ok=True)
        with open(abs_path, "wb") as f:
            f.write(base64.b64decode(screenshot_data))
        return {"success": True, "path": abs_path}
    return {"success": True, "screenshot": screenshot_data}

# PDF endpoint


@app.post("/pdf")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("pdf")
async def generate_pdf(
    request: Request,
    body: PDFRequest,
    _td: Dict = Depends(token_dep),
):
    """
    Generate a PDF document of the specified URL.
    Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF;
    the result will then contain the path to the saved file instead of the PDF data.
    """
    cfg = CrawlerRunConfig(pdf=True)
    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
        results = await crawler.arun(url=body.url, config=cfg)
    pdf_data = results[0].pdf
    if body.output_path:
        abs_path = os.path.abspath(body.output_path)
        os.makedirs(os.path.dirname(abs_path), exist_ok=True)
        with open(abs_path, "wb") as f:
            f.write(pdf_data)
        return {"success": True, "path": abs_path}
    return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}

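Both asset endpoints switch their response shape on `output_path`: with a path, the binary is written server-side and only the absolute path is returned; without one, the base64 payload comes back inline. A hedged client-side sketch (the local port and file paths are assumptions):

```python
# Sketch: save a PDF server-side vs. receive a base64 screenshot inline.
# Assumes a local deployment on port 11235 and a writable /tmp on the server.
import base64
import requests

BASE = "http://localhost:11235"

# 1) PDF written to disk on the server; the response carries only the path.
pdf = requests.post(f"{BASE}/pdf",
                    json={"url": "https://example.com",
                          "output_path": "/tmp/example.pdf"}).json()
print(pdf)  # e.g. {"success": true, "path": "/tmp/example.pdf"}

# 2) Screenshot returned inline as base64; decode and save locally.
shot = requests.post(f"{BASE}/screenshot",
                     json={"url": "https://example.com",
                           "screenshot_wait_for": 1.0}).json()
with open("example.png", "wb") as fh:
    fh.write(base64.b64decode(shot["screenshot"]))
```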
@app.post("/execute_js")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
@mcp_tool("execute_js")
|
||||
async def execute_js(
|
||||
request: Request,
|
||||
body: JSEndpointRequest,
|
||||
_td: Dict = Depends(token_dep),
|
||||
):
|
||||
"""
|
||||
Execute a sequence of JavaScript snippets on the specified URL.
|
||||
Return the full CrawlResult JSON (first result).
|
||||
Use this when you need to interact with dynamic pages using JS.
|
||||
REMEMBER: Scripts accept a list of separated JS snippets to execute and execute them in order.
|
||||
IMPORTANT: Each script should be an expression that returns a value. It can be an IIFE or an async function. You can think of it as such.
|
||||
Your script will replace '{script}' and execute in the browser context. So provide either an IIFE or a sync/async function that returns a value.
|
||||
Return Format:
|
||||
- The return result is an instance of CrawlResult, so you have access to markdown, links, and other stuff. If this is enough, you don't need to call again for other endpoints.
|
||||
|
||||
```python
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
success: bool
|
||||
cleaned_html: Optional[str] = None
|
||||
media: Dict[str, List[Dict]] = {}
|
||||
links: Dict[str, List[Dict]] = {}
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
js_execution_result: Optional[Dict[str, Any]] = None
|
||||
screenshot: Optional[str] = None
|
||||
pdf: Optional[bytes] = None
|
||||
mhtml: Optional[str] = None
|
||||
_markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
|
||||
extracted_content: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
error_message: Optional[str] = None
|
||||
session_id: Optional[str] = None
|
||||
response_headers: Optional[dict] = None
|
||||
status_code: Optional[int] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
dispatch_result: Optional[DispatchResult] = None
|
||||
redirected_url: Optional[str] = None
|
||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||
|
||||
class MarkdownGenerationResult(BaseModel):
|
||||
raw_markdown: str
|
||||
markdown_with_citations: str
|
||||
references_markdown: str
|
||||
fit_markdown: Optional[str] = None
|
||||
fit_html: Optional[str] = None
|
||||
```
|
||||
|
||||
"""
|
||||
cfg = CrawlerRunConfig(js_code=body.scripts)
|
||||
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
||||
results = await crawler.arun(url=body.url, config=cfg)
|
||||
# Return JSON-serializable dict of the first CrawlResult
|
||||
data = results[0].model_dump()
|
||||
return JSONResponse(data)
|
||||
|
||||
|
||||
@app.get("/llm/{url:path}")
|
||||
async def llm_endpoint(
|
||||
@ -224,27 +442,35 @@ async def llm_endpoint(
|
||||
answer = await handle_llm_qa(url, q, config)
|
||||
return JSONResponse({"answer": answer})
|
||||
|
||||
|
||||
@app.get("/schema")
|
||||
async def get_schema():
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig
|
||||
return {"browser": BrowserConfig().dump(),
|
||||
"crawler": CrawlerRunConfig().dump()}
|
||||
|
||||
|
||||
@app.get(config["observability"]["health_check"]["endpoint"])
|
||||
async def health():
|
||||
return {"status": "ok", "timestamp": time.time(), "version": __version__}
|
||||
|
||||
|
||||
@app.get(config["observability"]["prometheus"]["endpoint"])
|
||||
async def metrics():
|
||||
return RedirectResponse(config["observability"]["prometheus"]["endpoint"])
|
||||
|
||||
|
||||
@app.post("/crawl")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
@mcp_tool("crawl")
|
||||
async def crawl(
|
||||
request: Request,
|
||||
crawl_request: CrawlRequest,
|
||||
_td: Dict = Depends(token_dep),
|
||||
):
|
||||
"""
|
||||
Crawl a list of URLs and return the results as JSON.
|
||||
"""
|
||||
if not crawl_request.urls:
|
||||
raise HTTPException(400, "At least one URL required")
|
||||
res = await handle_crawl_request(
|
||||
@ -255,6 +481,7 @@ async def crawl(
|
||||
)
|
||||
return JSONResponse(res)
|
||||
|
||||
|
||||
@app.post("/crawl/stream")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
async def crawl_stream(
|
||||
@ -280,6 +507,133 @@ async def crawl_stream(
|
||||
},
|
||||
)
|
||||
|
||||
def chunk_code_functions(code_md: str) -> List[str]:
    """Extract each function/class from markdown code blocks per file."""
    pattern = re.compile(
        # match "## File: <path>" then a ```py fence, then capture until the closing ```
        r'##\s*File:\s*(?P<path>.+?)\s*?\r?\n'   # file header
        r'```py\s*?\r?\n'                        # opening fence
        r'(?P<code>.*?)(?=\r?\n```)',            # code block
        re.DOTALL
    )
    chunks: List[str] = []
    for m in pattern.finditer(code_md):
        file_path = m.group("path").strip()
        code_blk = m.group("code")
        tree = ast.parse(code_blk)
        lines = code_blk.splitlines()
        for node in tree.body:
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                start = node.lineno - 1
                end = getattr(node, "end_lineno", start + 1)
                snippet = "\n".join(lines[start:end])
                chunks.append(f"# File: {file_path}\n{snippet}")
    return chunks

def chunk_doc_sections(doc: str) -> List[str]:
    lines = doc.splitlines(keepends=True)
    sections = []
    current: List[str] = []
    for line in lines:
        if re.match(r"^#{1,6}\s", line):
            if current:
                sections.append("".join(current))
            current = [line]
        else:
            current.append(line)
    if current:
        sections.append("".join(current))
    return sections

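`chunk_code_functions` relies on `ast` to pull out top-level function and class definitions via their `lineno`/`end_lineno` spans. Below is a small, self-contained sketch of that chunking step on an inline snippet; the sample code is illustrative and not taken from the context files.

```python
# Sketch: syntax-aware chunking of top-level defs/classes,
# the same per-block step chunk_code_functions performs.
import ast
from typing import List

def chunk_top_level(code: str) -> List[str]:
    tree = ast.parse(code)
    lines = code.splitlines()
    out = []
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            out.append("\n".join(lines[node.lineno - 1:node.end_lineno]))
    return out

sample = "import os\n\ndef greet(name):\n    return f'hi {name}'\n\nclass Box:\n    size = 1\n"
for chunk in chunk_top_level(sample):
    print("---\n" + chunk)
# prints greet() and Box as separate chunks; the bare import is skipped
```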
@app.get("/ask")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
@mcp_tool("ask")
|
||||
async def get_context(
|
||||
request: Request,
|
||||
_td: Dict = Depends(token_dep),
|
||||
context_type: str = Query("all", regex="^(code|doc|all)$"),
|
||||
query: Optional[str] = Query(None, description="search query to filter chunks"),
|
||||
score_ratio: float = Query(0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"),
|
||||
max_results: int = Query(20, ge=1, description="absolute cap on returned chunks"),
|
||||
):
|
||||
"""
|
||||
This end point is design for any questions about Crawl4ai library. It returns a plain text markdown with extensive information about Crawl4ai.
|
||||
You can use this as a context for any AI assistant. Use this endpoint for AI assistants to retrieve library context for decision making or code generation tasks.
|
||||
Alway is BEST practice you provide a query to filter the context. Otherwise the lenght of the response will be very long.
|
||||
|
||||
Parameters:
|
||||
- context_type: Specify "code" for code context, "doc" for documentation context, or "all" for both.
|
||||
- query: RECOMMENDED search query to filter paragraphs using BM25. You can leave this empty to get all the context.
|
||||
- score_ratio: Minimum score as a fraction of the maximum score for filtering results.
|
||||
- max_results: Maximum number of results to return. Default is 20.
|
||||
|
||||
Returns:
|
||||
- JSON response with the requested context.
|
||||
- If "code" is specified, returns the code context.
|
||||
- If "doc" is specified, returns the documentation context.
|
||||
- If "all" is specified, returns both code and documentation contexts.
|
||||
"""
|
||||
# load contexts
|
||||
base = os.path.dirname(__file__)
|
||||
code_path = os.path.join(base, "c4ai-code-context.md")
|
||||
doc_path = os.path.join(base, "c4ai-doc-context.md")
|
||||
if not os.path.exists(code_path) or not os.path.exists(doc_path):
|
||||
raise HTTPException(404, "Context files not found")
|
||||
|
||||
with open(code_path, "r") as f:
|
||||
code_content = f.read()
|
||||
with open(doc_path, "r") as f:
|
||||
doc_content = f.read()
|
||||
|
||||
# if no query, just return raw contexts
|
||||
if not query:
|
||||
if context_type == "code":
|
||||
return JSONResponse({"code_context": code_content})
|
||||
if context_type == "doc":
|
||||
return JSONResponse({"doc_context": doc_content})
|
||||
return JSONResponse({
|
||||
"code_context": code_content,
|
||||
"doc_context": doc_content,
|
||||
})
|
||||
|
||||
tokens = query.split()
|
||||
results: Dict[str, List[Dict[str, float]]] = {}
|
||||
|
||||
# code BM25 over functions/classes
|
||||
if context_type in ("code", "all"):
|
||||
code_chunks = chunk_code_functions(code_content)
|
||||
bm25 = BM25Okapi([c.split() for c in code_chunks])
|
||||
scores = bm25.get_scores(tokens)
|
||||
max_sc = float(scores.max()) if scores.size > 0 else 0.0
|
||||
cutoff = max_sc * score_ratio
|
||||
picked = [(c, s) for c, s in zip(code_chunks, scores) if s >= cutoff]
|
||||
picked = sorted(picked, key=lambda x: x[1], reverse=True)[:max_results]
|
||||
results["code_results"] = [{"text": c, "score": s} for c, s in picked]
|
||||
|
||||
# doc BM25 over markdown sections
|
||||
if context_type in ("doc", "all"):
|
||||
sections = chunk_doc_sections(doc_content)
|
||||
bm25d = BM25Okapi([sec.split() for sec in sections])
|
||||
scores_d = bm25d.get_scores(tokens)
|
||||
max_sd = float(scores_d.max()) if scores_d.size > 0 else 0.0
|
||||
cutoff_d = max_sd * score_ratio
|
||||
idxs = [i for i, s in enumerate(scores_d) if s >= cutoff_d]
|
||||
neighbors = set(i for idx in idxs for i in (idx-1, idx, idx+1))
|
||||
valid = [i for i in sorted(neighbors) if 0 <= i < len(sections)]
|
||||
valid = valid[:max_results]
|
||||
results["doc_results"] = [
|
||||
{"text": sections[i], "score": scores_d[i]} for i in valid
|
||||
]
|
||||
|
||||
return JSONResponse(results)
|
||||
|
||||
|
||||
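The ranking logic is plain rank_bm25: tokenize every chunk, score the query tokens, and keep chunks whose score clears `score_ratio * max_score`. A minimal sketch of that filtering step in isolation, on a toy corpus rather than the real context files:

```python
# Sketch: BM25 relevance filtering as used by /ask, on a toy corpus.
from rank_bm25 import BM25Okapi

chunks = [
    "def arun(self, url, config): crawl a single page",
    "class BrowserConfig: headless, viewport and proxy settings",
    "def get_internal_links(result): return result.links['internal']",
]
query = "extract internal links"
score_ratio = 0.5

bm25 = BM25Okapi([c.split() for c in chunks])
scores = bm25.get_scores(query.split())
cutoff = max(scores) * score_ratio
picked = sorted(
    ((c, s) for c, s in zip(chunks, scores) if s >= cutoff),
    key=lambda x: x[1], reverse=True,
)
for text, score in picked:
    print(f"{score:.2f}  {text}")
```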
# attach MCP layer (adds /mcp/ws, /mcp/sse, /mcp/schema)
attach_mcp(
    app,
    base_url=f"http://{config['app']['host']}:{config['app']['port']}"
)

# ────────────────────────── cli ──────────────────────────────
if __name__ == "__main__":
    import uvicorn
tests/mcp/test_mcp_socket.py (new file, +119 lines)
@@ -0,0 +1,119 @@
# pip install "mcp-sdk[ws]" anyio
import anyio, json
from mcp.client.websocket import websocket_client
from mcp.client.session import ClientSession

async def test_list():
    async with websocket_client("ws://localhost:8020/mcp/ws") as (r, w):
        async with ClientSession(r, w) as s:
            await s.initialize()

            print("tools :", [t.name for t in (await s.list_tools()).tools])
            print("resources :", [r.name for r in (await s.list_resources()).resources])
            print("templates :", [t.name for t in (await s.list_resource_templates()).resource_templates])


async def test_crawl(s: ClientSession) -> None:
    """Hit the @mcp_tool('crawl') endpoint."""
    res = await s.call_tool(
        "crawl",
        {
            "urls": ["https://example.com"],
            "browser_config": {},
            "crawler_config": {},
        },
    )
    print("crawl →", json.loads(res.content[0].text))


async def test_md(s: ClientSession) -> None:
    """Hit the @mcp_tool('md') endpoint."""
    res = await s.call_tool(
        "md",
        {
            "url": "https://example.com",
            "f": "fit",      # or RAW, BM25, LLM
            "q": None,
            "c": "0",
        },
    )
    result = json.loads(res.content[0].text)
    print("md →", result['markdown'][:100], "...")

async def test_screenshot(s: ClientSession):
    res = await s.call_tool(
        "screenshot",
        {
            "url": "https://example.com",
            "screenshot_wait_for": 1.0,
        },
    )
    png_b64 = json.loads(res.content[0].text)["screenshot"]
    print("screenshot →", png_b64[:60], "… (base64)")


async def test_pdf(s: ClientSession):
    res = await s.call_tool(
        "pdf",
        {
            "url": "https://example.com",
        },
    )
    pdf_b64 = json.loads(res.content[0].text)["pdf"]
    print("pdf →", pdf_b64[:60], "… (base64)")

async def test_execute_js(s: ClientSession):
    # click the “More” link on the Hacker News front page and wait 1 s
    res = await s.call_tool(
        "execute_js",
        {
            "url": "https://news.ycombinator.com/news",
            "scripts": [          # JSEndpointRequest expects "scripts"
                "await page.click('a.morelink')",
                "await page.waitForTimeout(1000)",
            ],
        },
    )
    crawl_result = json.loads(res.content[0].text)
    print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))

async def test_html(s: ClientSession):
    # fetch preprocessed HTML for the Hacker News front page
    res = await s.call_tool(
        "html",
        {
            "url": "https://news.ycombinator.com/news",
        },
    )
    crawl_result = json.loads(res.content[0].text)
    print("html → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))

async def test_context(s: ClientSession):
    # ask the "ask" tool a question about the Crawl4ai library
    res = await s.call_tool(
        "ask",
        {
            "query": "I have a question about the Crawl4ai library: how do I extract internal links when crawling a page?"
        },
    )
    context_result = json.loads(res.content[0].text)
    print("ask →", list(context_result.keys()))


async def main() -> None:
    async with websocket_client("ws://localhost:8020/mcp/ws") as (r, w):
        async with ClientSession(r, w) as s:
            await s.initialize()                  # handshake
            tools = (await s.list_tools()).tools
            print("tools:", [t.name for t in tools])

            # await test_list()
            # await test_crawl(s)
            # await test_md(s)
            # await test_screenshot(s)
            # await test_pdf(s)
            # await test_execute_js(s)
            # await test_html(s)
            await test_context(s)

anyio.run(main)
tests/mcp/test_mcp_sse.py (new file, +11 lines)
@@ -0,0 +1,11 @@
from mcp.client.sse import sse_client
from mcp.client.session import ClientSession

async def main():
    async with sse_client("http://127.0.0.1:8020/mcp") as (r, w):
        async with ClientSession(r, w) as sess:
            print(await sess.list_tools())   # now works

if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
@@ -11,7 +11,8 @@ If the server isn’t running, start it first:

import sys, json, textwrap, requests

BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
# BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235"
URL = f"{BASE.rstrip('/')}/config/dump"

CASES = [