| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | import datetime | 
					
						
							|  |  |  | import logging | 
					
						
							|  |  |  | import time | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import click | 
					
						
							|  |  |  | from celery import shared_task | 
					
						
							| 
									
										
										
										
											2024-01-12 12:34:01 +08:00
										 |  |  | from core.indexing_runner import DocumentIsPausedException, IndexingRunner | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | from extensions.ext_database import db | 
					
						
							|  |  |  | from models.dataset import Document | 
					
						
							| 
									
										
										
										
											2024-01-12 12:34:01 +08:00
										 |  |  | from werkzeug.exceptions import NotFound | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-31 13:13:08 +08:00
										 |  |  | @shared_task(queue='dataset') | 
					
						
							| 
									
										
										
										
											2023-06-16 21:47:51 +08:00
										 |  |  | def document_indexing_task(dataset_id: str, document_ids: list): | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     Async process document | 
					
						
							|  |  |  |     :param dataset_id: | 
					
						
							| 
									
										
										
										
											2023-06-16 21:47:51 +08:00
										 |  |  |     :param document_ids: | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     Usage: document_indexing_task.delay(dataset_id, document_id) | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2023-06-16 21:47:51 +08:00
										 |  |  |     documents = [] | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |     start_at = time.perf_counter() | 
					
						
							| 
									
										
										
										
											2023-06-16 21:47:51 +08:00
										 |  |  |     for document_id in document_ids: | 
					
						
							|  |  |  |         logging.info(click.style('Start process document: {}'.format(document_id), fg='green')) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         document = db.session.query(Document).filter( | 
					
						
							|  |  |  |             Document.id == document_id, | 
					
						
							|  |  |  |             Document.dataset_id == dataset_id | 
					
						
							|  |  |  |         ).first() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-12 13:30:44 +08:00
										 |  |  |         if document: | 
					
						
							|  |  |  |             document.indexing_status = 'parsing' | 
					
						
							|  |  |  |             document.processing_started_at = datetime.datetime.utcnow() | 
					
						
							|  |  |  |             documents.append(document) | 
					
						
							|  |  |  |             db.session.add(document) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     db.session.commit() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         indexing_runner = IndexingRunner() | 
					
						
							| 
									
										
										
										
											2023-06-16 21:47:51 +08:00
										 |  |  |         indexing_runner.run(documents) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         end_at = time.perf_counter() | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |         logging.info(click.style('Processed dataset: {} latency: {}'.format(dataset_id, end_at - start_at), fg='green')) | 
					
						
							|  |  |  |     except DocumentIsPausedException as ex: | 
					
						
							|  |  |  |         logging.info(click.style(str(ex), fg='yellow')) | 
					
						
							|  |  |  |     except Exception: | 
					
						
							|  |  |  |         pass |