| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | import logging | 
					
						
							|  |  |  | import time | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import click | 
					
						
							|  |  |  | from celery import shared_task | 
					
						
							| 
									
										
										
										
											2024-02-06 13:21:13 +08:00
										 |  |  | from werkzeug.exceptions import NotFound | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  | from core.rag.index_processor.index_processor_factory import IndexProcessorFactory | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | from extensions.ext_database import db | 
					
						
							|  |  |  | from extensions.ext_redis import redis_client | 
					
						
							| 
									
										
										
										
											2024-01-12 12:34:01 +08:00
										 |  |  | from models.dataset import Document, DocumentSegment | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-31 13:13:08 +08:00
										 |  |  | @shared_task(queue='dataset') | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | def remove_document_from_index_task(document_id: str): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Async Remove document from index | 
					
						
							|  |  |  |     :param document_id: document id | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Usage: remove_document_from_index.delay(document_id) | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     logging.info(click.style('Start remove document segments from index: {}'.format(document_id), fg='green')) | 
					
						
							|  |  |  |     start_at = time.perf_counter() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     document = db.session.query(Document).filter(Document.id == document_id).first() | 
					
						
							|  |  |  |     if not document: | 
					
						
							|  |  |  |         raise NotFound('Document not found') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if document.indexing_status != 'completed': | 
					
						
							|  |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     indexing_cache_key = 'document_{}_indexing'.format(document.id) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         dataset = document.dataset | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if not dataset: | 
					
						
							|  |  |  |             raise Exception('Document has no dataset') | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  |         index_processor = IndexProcessorFactory(document.doc_form).init_index_processor() | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document.id).all() | 
					
						
							|  |  |  |         index_node_ids = [segment.index_node_id for segment in segments] | 
					
						
							|  |  |  |         if index_node_ids: | 
					
						
							| 
									
										
										
										
											2024-02-23 12:30:39 +08:00
										 |  |  |             try: | 
					
						
							|  |  |  |                 index_processor.clean(dataset, index_node_ids) | 
					
						
							|  |  |  |             except Exception: | 
					
						
							|  |  |  |                 logging.exception(f"clean dataset {dataset.id} from index failed") | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         end_at = time.perf_counter() | 
					
						
							|  |  |  |         logging.info( | 
					
						
							|  |  |  |             click.style('Document removed from index: {} latency: {}'.format(document.id, end_at - start_at), fg='green')) | 
					
						
							|  |  |  |     except Exception: | 
					
						
							|  |  |  |         logging.exception("remove document from index failed") | 
					
						
							|  |  |  |         if not document.archived: | 
					
						
							|  |  |  |             document.enabled = True | 
					
						
							|  |  |  |             db.session.commit() | 
					
						
							|  |  |  |     finally: | 
					
						
							|  |  |  |         redis_client.delete(indexing_cache_key) |