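"""FastAPI server that exposes Crawl4AI's AsyncWebCrawler over HTTP.

Endpoints:
    POST /crawl   - crawl one or more URLs, either as a single batch response
                    or as an NDJSON stream when the crawler config enables streaming
    GET  /schema  - JSON schemas for the browser and crawler configs
    GET  /health  - liveness check
"""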
import os
import sys

sys.path.append(os.path.dirname(os.path.realpath(__file__)))

import asyncio
import json
from typing import AsyncGenerator, List, Optional

from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from crawl4ai import (
    BrowserConfig,
    CrawlerRunConfig,
    AsyncWebCrawler,
    MemoryAdaptiveDispatcher,
    RateLimiter,
)

class CrawlRequest(BaseModel):
    urls: List[str]
    browser_config: Optional[dict] = None
    crawler_config: Optional[dict] = None


class CrawlResponse(BaseModel):
    success: bool
    results: List[dict]

    class Config:
        arbitrary_types_allowed = True
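
# Example request body (illustrative only; the keys accepted inside
# browser_config and crawler_config are whatever BrowserConfig.load and
# CrawlerRunConfig.load understand in the installed crawl4ai version):
#
#   {
#       "urls": ["https://example.com"],
#       "browser_config": {"headless": true},
#       "crawler_config": {"stream": false}
#   }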

app = FastAPI(title="Crawl4AI API")

async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
    """Stream results and manage crawler lifecycle"""

    def datetime_handler(obj):
        """Custom handler for datetime objects during JSON serialization"""
        if hasattr(obj, 'isoformat'):
            return obj.isoformat()
        raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

    try:
        async for result in results_gen:
            try:
                # Use dump method for serialization
                result_dict = result.model_dump()
                print(f"Streaming result for URL: {result_dict['url']}, Success: {result_dict['success']}")
                # Use custom JSON encoder with datetime handler
                yield (json.dumps(result_dict, default=datetime_handler) + "\n").encode('utf-8')
            except Exception as e:
                print(f"Error serializing result: {e}")
                error_response = {
                    "error": str(e),
                    "url": getattr(result, 'url', 'unknown')
                }
                yield (json.dumps(error_response, default=datetime_handler) + "\n").encode('utf-8')
    except asyncio.CancelledError:
        print("Client disconnected, cleaning up...")
    finally:
        try:
            await crawler.close()
        except Exception as e:
            print(f"Error closing crawler: {e}")

@app.post("/crawl")
async def crawl(request: CrawlRequest):
    # Load configs using our new utilities
    browser_config = BrowserConfig.load(request.browser_config)
    crawler_config = CrawlerRunConfig.load(request.crawler_config)

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=95.0,
        rate_limiter=RateLimiter(base_delay=(1.0, 2.0)),
    )

    try:
        if crawler_config.stream:
            # Streaming mode: the crawler must outlive this handler, so it is
            # started explicitly and closed in stream_results' finally block.
            crawler = AsyncWebCrawler(config=browser_config)
            await crawler.start()

            results_gen = await crawler.arun_many(
                urls=request.urls,
                config=crawler_config,
                dispatcher=dispatcher
            )

            return StreamingResponse(
                stream_results(crawler, results_gen),
                media_type='application/x-ndjson'
            )
        else:
            # Batch mode: the context manager handles crawler startup and cleanup.
            async with AsyncWebCrawler(config=browser_config) as crawler:
                results = await crawler.arun_many(
                    urls=request.urls,
                    config=crawler_config,
                    dispatcher=dispatcher
                )
                # Use dump method for each result
                results_dict = [result.model_dump() for result in results]
                return CrawlResponse(success=True, results=results_dict)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
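
# Example (illustrative): request a streaming crawl and read the NDJSON response.
# Whether {"stream": true} is the right way to enable streaming depends on how
# CrawlerRunConfig.load interprets the crawler_config dict in your crawl4ai version.
#
#   curl -N -X POST http://localhost:8000/crawl \
#     -H "Content-Type: application/json" \
#     -d '{"urls": ["https://example.com"], "crawler_config": {"stream": true}}'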

@app.get("/schema")
async def get_schema():
    """Return config schemas for client validation"""
    return {
        "browser": BrowserConfig.model_json_schema(),
        "crawler": CrawlerRunConfig.model_json_schema()
    }
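
# Example (illustrative): fetch the schemas for client-side validation of the
# browser_config / crawler_config dicts before POSTing to /crawl.
#   curl http://localhost:8000/schema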

@app.get("/health")
async def health():
    return {"status": "ok"}

if __name__ == "__main__":
    import uvicorn

    uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=True)
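
# The app can also be launched from the command line (assuming this module is
# saved as server.py, matching the "server:app" import string above):
#   uvicorn server:app --host 0.0.0.0 --port 8000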