From 90df6921b7be573d95795907fcdebd28002dfd9b Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Sat, 16 Nov 2024 15:34:30 +0800
Subject: [PATCH] feat(crawl_sync): add synchronous crawl endpoint and corresponding test

---
 docs/examples/docker_example.py | 21 +++++++++++++++++++++
 main.py                         | 24 ++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py
index c22acd5..502f1e5 100644
--- a/docs/examples/docker_example.py
+++ b/docs/examples/docker_example.py
@@ -33,6 +33,13 @@ class Crawl4AiTester:
                 return status
             time.sleep(2)
 
+
+    def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
+        response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, timeout=60)
+        if response.status_code == 408:
+            raise TimeoutError("Task did not complete within server timeout")
+        response.raise_for_status()
+        return response.json()
 
 def test_docker_deployment(version="basic"):
     tester = Crawl4AiTester()
@@ -54,6 +61,7 @@ def test_docker_deployment(version="basic"):
 
     # Test cases based on version
     test_basic_crawl(tester)
+    test_basic_crawl_sync(tester)
 
     # if version in ["full", "transformer"]:
     #     test_cosine_extraction(tester)
@@ -78,6 +86,19 @@ def test_basic_crawl(tester: Crawl4AiTester):
     assert result["result"]["success"]
     assert len(result["result"]["markdown"]) > 0
 
+def test_basic_crawl_sync(tester: Crawl4AiTester):
+    print("\n=== Testing Basic Crawl (Sync) ===")
+    request = {
+        "urls": "https://www.nbcnews.com/business",
+        "priority": 10
+    }
+
+    result = tester.submit_sync(request)
+    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
+    assert result['status'] == 'completed'
+    assert result['result']['success']
+    assert len(result['result']['markdown']) > 0
+
 def test_js_execution(tester: Crawl4AiTester):
     print("\n=== Testing JS Execution ===")
     request = {
diff --git a/main.py b/main.py
index a5da029..660c336 100644
--- a/main.py
+++ b/main.py
@@ -375,6 +375,30 @@ async def get_task_status(task_id: str):
 
     return response
 
+@app.post("/crawl_sync")
+async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
+    task_id = await crawler_service.submit_task(request)
+
+    # Wait up to 60 seconds for task completion
+    for _ in range(60):
+        task_info = crawler_service.task_manager.get_task(task_id)
+        if not task_info:
+            raise HTTPException(status_code=404, detail="Task not found")
+
+        if task_info.status == TaskStatus.COMPLETED:
+            # Return same format as /task/{task_id} endpoint
+            if isinstance(task_info.result, list):
+                return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
+            return {"status": task_info.status, "result": task_info.result.dict()}
+
+        if task_info.status == TaskStatus.FAILED:
+            raise HTTPException(status_code=500, detail=task_info.error)
+
+        await asyncio.sleep(1)
+
+    # If we get here, task didn't complete within timeout
+    raise HTTPException(status_code=408, detail="Task timed out")
+
 @app.get("/health")
 async def health_check():
     available_slots = await crawler_service.resource_monitor.get_available_slots()
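
Usage sketch (not part of the patch): a minimal standalone client call against the new /crawl_sync endpoint. The base URL is an assumption here; the patch itself does not pin a host or port, so adjust it to wherever the FastAPI app from main.py is served. The client timeout is deliberately set above the server's 60-second polling window so the server's 408 response can arrive before the client gives up:

    import requests

    # Assumed base URL; not specified by the patch itself.
    BASE_URL = "http://localhost:11235"

    # Blocks until the server completes the crawl, fails it (HTTP 500),
    # or gives up after ~60 seconds (HTTP 408).
    response = requests.post(
        f"{BASE_URL}/crawl_sync",
        json={"urls": "https://www.nbcnews.com/business", "priority": 10},
        timeout=70,  # longer than the server's 60 s wait, so a 408 can arrive first
    )
    response.raise_for_status()
    data = response.json()
    assert data["status"] == "completed"
    print(len(data["result"]["markdown"]))

Note that submit_sync in docker_example.py uses timeout=60, equal to the server's own wait, so the client can time out at the same moment the server returns 408; a slightly larger client timeout, as above, avoids that race.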