| 
									
										
										
										
											2024-12-24 18:38:51 +08:00
										 |  |  | from flask_restful import Resource, reqparse  # type: ignore | 
					
						
							| 
									
										
										
										
											2024-06-15 02:46:02 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | from controllers.console import api | 
					
						
							|  |  |  | from controllers.console.datasets.error import WebsiteCrawlError | 
					
						
							| 
									
										
										
										
											2024-11-01 15:51:22 +08:00
										 |  |  | from controllers.console.wraps import account_initialization_required, setup_required | 
					
						
							| 
									
										
										
										
											2024-06-15 02:46:02 +08:00
										 |  |  | from libs.login import login_required | 
					
						
							|  |  |  | from services.website_service import WebsiteService | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class WebsiteCrawlApi(Resource): | 
					
						
							|  |  |  |     @setup_required | 
					
						
							|  |  |  |     @login_required | 
					
						
							|  |  |  |     @account_initialization_required | 
					
						
							|  |  |  |     def post(self): | 
					
						
							|  |  |  |         parser = reqparse.RequestParser() | 
					
						
							| 
									
										
										
										
											2024-09-30 09:57:19 +08:00
										 |  |  |         parser.add_argument( | 
					
						
							|  |  |  |             "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json" | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-08-26 15:29:10 +08:00
										 |  |  |         parser.add_argument("url", type=str, required=True, nullable=True, location="json") | 
					
						
							|  |  |  |         parser.add_argument("options", type=dict, required=True, nullable=True, location="json") | 
					
						
							| 
									
										
										
										
											2024-06-15 02:46:02 +08:00
										 |  |  |         args = parser.parse_args() | 
					
						
							|  |  |  |         WebsiteService.document_create_args_validate(args) | 
					
						
							|  |  |  |         # crawl url | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             result = WebsiteService.crawl_url(args) | 
					
						
							|  |  |  |         except Exception as e: | 
					
						
							|  |  |  |             raise WebsiteCrawlError(str(e)) | 
					
						
							|  |  |  |         return result, 200 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class WebsiteCrawlStatusApi(Resource): | 
					
						
							|  |  |  |     @setup_required | 
					
						
							|  |  |  |     @login_required | 
					
						
							|  |  |  |     @account_initialization_required | 
					
						
							|  |  |  |     def get(self, job_id: str): | 
					
						
							|  |  |  |         parser = reqparse.RequestParser() | 
					
						
							| 
									
										
										
										
											2024-09-30 09:57:19 +08:00
										 |  |  |         parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args") | 
					
						
							| 
									
										
										
										
											2024-06-15 02:46:02 +08:00
										 |  |  |         args = parser.parse_args() | 
					
						
							|  |  |  |         # get crawl status | 
					
						
							|  |  |  |         try: | 
					
						
							| 
									
										
										
										
											2024-08-26 15:29:10 +08:00
										 |  |  |             result = WebsiteService.get_crawl_status(job_id, args["provider"]) | 
					
						
							| 
									
										
										
										
											2024-06-15 02:46:02 +08:00
										 |  |  |         except Exception as e: | 
					
						
							|  |  |  |             raise WebsiteCrawlError(str(e)) | 
					
						
							|  |  |  |         return result, 200 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-26 15:29:10 +08:00
										 |  |  | api.add_resource(WebsiteCrawlApi, "/website/crawl") | 
					
						
							|  |  |  | api.add_resource(WebsiteCrawlStatusApi, "/website/crawl/status/<string:job_id>") |