feat(crawl_sync): add synchronous crawl endpoint and corresponding test

This commit is contained in:
UncleCode 2024-11-16 15:34:30 +08:00
parent 5098442086
commit 90df6921b7
2 changed files with 45 additions and 0 deletions

View File

@ -33,6 +33,13 @@ class Crawl4AiTester:
return status
time.sleep(2)
def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, timeout=60)
if response.status_code == 408:
raise TimeoutError("Task did not complete within server timeout")
response.raise_for_status()
return response.json()
def test_docker_deployment(version="basic"):
tester = Crawl4AiTester()
@ -54,6 +61,7 @@ def test_docker_deployment(version="basic"):
# Test cases based on version
test_basic_crawl(tester)
test_basic_crawl_sync(tester)
# if version in ["full", "transformer"]:
# test_cosine_extraction(tester)
@ -78,6 +86,19 @@ def test_basic_crawl(tester: Crawl4AiTester):
assert result["result"]["success"]
assert len(result["result"]["markdown"]) > 0
def test_basic_crawl_sync(tester: Crawl4AiTester):
print("\n=== Testing Basic Crawl (Sync) ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 10
}
result = tester.submit_sync(request)
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
assert result['status'] == 'completed'
assert result['result']['success']
assert len(result['result']['markdown']) > 0
def test_js_execution(tester: Crawl4AiTester):
print("\n=== Testing JS Execution ===")
request = {

24
main.py
View File

@ -375,6 +375,30 @@ async def get_task_status(task_id: str):
return response
@app.post("/crawl_sync")
async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
task_id = await crawler_service.submit_task(request)
# Wait up to 60 seconds for task completion
for _ in range(60):
task_info = crawler_service.task_manager.get_task(task_id)
if not task_info:
raise HTTPException(status_code=404, detail="Task not found")
if task_info.status == TaskStatus.COMPLETED:
# Return same format as /task/{task_id} endpoint
if isinstance(task_info.result, list):
return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
return {"status": task_info.status, "result": task_info.result.dict()}
if task_info.status == TaskStatus.FAILED:
raise HTTPException(status_code=500, detail=task_info.error)
await asyncio.sleep(1)
# If we get here, task didn't complete within timeout
raise HTTPException(status_code=408, detail="Task timed out")
@app.get("/health")
async def health_check():
available_slots = await crawler_service.resource_monitor.get_available_slots()