import logging
import time

import click
from celery import shared_task
from werkzeug.exceptions import NotFound

from core.indexing_runner import DocumentIsPausedException, IndexingRunner
from extensions.ext_database import db
from models.dataset import Document


@shared_task(queue='dataset')
def recover_document_indexing_task(dataset_id: str, document_id: str):
    """
    Async recover document indexing.
    :param dataset_id: dataset id
    :param document_id: document id

    Usage: recover_document_indexing_task.delay(dataset_id, document_id)
    """
    logging.info(click.style('Recover document: {}'.format(document_id), fg='green'))
    start_at = time.perf_counter()

    # Look up the document within the given dataset.
    document = db.session.query(Document).filter(
        Document.id == document_id,
        Document.dataset_id == dataset_id
    ).first()

    if not document:
        raise NotFound('Document not found')

    try:
        indexing_runner = IndexingRunner()
        # Resume indexing from the stage the document was interrupted at.
        if document.indexing_status in ["waiting", "parsing", "cleaning"]:
            indexing_runner.run([document])
        elif document.indexing_status == "splitting":
            indexing_runner.run_in_splitting_status(document)
        elif document.indexing_status == "indexing":
            indexing_runner.run_in_indexing_status(document)
        end_at = time.perf_counter()
        logging.info(click.style('Processed document: {} latency: {}'.format(document.id, end_at - start_at), fg='green'))
    except DocumentIsPausedException as ex:
        # The document was paused again; log and exit without failing the task.
        logging.info(click.style(str(ex), fg='yellow'))
    except Exception:
        # Suppress unexpected errors so they do not propagate out of the worker task.
        pass
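
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how this task might be enqueued from application
# code, following the Usage note in the docstring. It assumes a Celery
# worker is consuming the 'dataset' queue and that the import path below
# matches where this module lives in the project; adjust both to your setup.
#
#     from tasks.recover_document_indexing_task import recover_document_indexing_task
#
#     # Enqueue asynchronously; the worker resumes indexing for the document.
#     recover_document_indexing_task.delay(dataset_id, document_id)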