| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | import logging | 
					
						
							|  |  |  | import time | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import click | 
					
						
							| 
									
										
										
										
											2024-12-24 18:38:51 +08:00
										 |  |  | from celery import shared_task  # type: ignore | 
					
						
							| 
									
										
										
										
											2024-02-06 13:21:13 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-11 16:40:52 +08:00
										 |  |  | from core.indexing_runner import DocumentIsPausedError, IndexingRunner | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | from extensions.ext_database import db | 
					
						
							|  |  |  | from models.dataset import Document | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  | @shared_task(queue="dataset") | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | def recover_document_indexing_task(dataset_id: str, document_id: str): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Async recover document | 
					
						
							|  |  |  |     :param dataset_id: | 
					
						
							|  |  |  |     :param document_id: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Usage: recover_document_indexing_task.delay(dataset_id, document_id) | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |     logging.info(click.style("Recover document: {}".format(document_id), fg="green")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     start_at = time.perf_counter() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |     document = db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first() | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if not document: | 
					
						
							| 
									
										
										
										
											2025-04-07 20:31:26 +08:00
										 |  |  |         logging.info(click.style("Document not found: {}".format(document_id), fg="red")) | 
					
						
							|  |  |  |         db.session.close() | 
					
						
							|  |  |  |         return | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         indexing_runner = IndexingRunner() | 
					
						
							| 
									
										
										
										
											2024-09-13 22:42:08 +08:00
										 |  |  |         if document.indexing_status in {"waiting", "parsing", "cleaning"}: | 
					
						
							| 
									
										
										
										
											2023-06-16 21:47:51 +08:00
										 |  |  |             indexing_runner.run([document]) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         elif document.indexing_status == "splitting": | 
					
						
							|  |  |  |             indexing_runner.run_in_splitting_status(document) | 
					
						
							|  |  |  |         elif document.indexing_status == "indexing": | 
					
						
							|  |  |  |             indexing_runner.run_in_indexing_status(document) | 
					
						
							|  |  |  |         end_at = time.perf_counter() | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |         logging.info( | 
					
						
							|  |  |  |             click.style("Processed document: {} latency: {}".format(document.id, end_at - start_at), fg="green") | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-09-11 16:40:52 +08:00
										 |  |  |     except DocumentIsPausedError as ex: | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |         logging.info(click.style(str(ex), fg="yellow")) | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |     except Exception: | 
					
						
							| 
									
										
										
										
											2025-05-30 14:42:47 +08:00
										 |  |  |         logging.exception("recover_document_indexing_task failed, document_id: {}".format(document_id)) | 
					
						
							| 
									
										
										
										
											2025-04-07 20:31:26 +08:00
										 |  |  |     finally: | 
					
						
							|  |  |  |         db.session.close() |