| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | import logging | 
					
						
							|  |  |  | import time | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import click | 
					
						
							|  |  |  | from celery import shared_task | 
					
						
							| 
									
										
										
										
											2024-02-06 13:21:13 +08:00
										 |  |  | from werkzeug.exceptions import NotFound | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  | from core.rag.index_processor.index_processor_factory import IndexProcessorFactory | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | from extensions.ext_database import db | 
					
						
							|  |  |  | from extensions.ext_redis import redis_client | 
					
						
							| 
									
										
										
										
											2024-01-12 12:34:01 +08:00
										 |  |  | from models.dataset import Document, DocumentSegment | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  | @shared_task(queue="dataset") | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | def remove_document_from_index_task(document_id: str): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Async Remove document from index | 
					
						
							|  |  |  |     :param document_id: document id | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Usage: remove_document_from_index.delay(document_id) | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |     logging.info(click.style("Start remove document segments from index: {}".format(document_id), fg="green")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     start_at = time.perf_counter() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     document = db.session.query(Document).filter(Document.id == document_id).first() | 
					
						
							|  |  |  |     if not document: | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |         raise NotFound("Document not found") | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |     if document.indexing_status != "completed": | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |     indexing_cache_key = "document_{}_indexing".format(document.id) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         dataset = document.dataset | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if not dataset: | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |             raise Exception("Document has no dataset") | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  |         index_processor = IndexProcessorFactory(document.doc_form).init_index_processor() | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document.id).all() | 
					
						
							|  |  |  |         index_node_ids = [segment.index_node_id for segment in segments] | 
					
						
							|  |  |  |         if index_node_ids: | 
					
						
							| 
									
										
										
										
											2024-02-23 12:30:39 +08:00
										 |  |  |             try: | 
					
						
							|  |  |  |                 index_processor.clean(dataset, index_node_ids) | 
					
						
							|  |  |  |             except Exception: | 
					
						
							|  |  |  |                 logging.exception(f"clean dataset {dataset.id} from index failed") | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         end_at = time.perf_counter() | 
					
						
							|  |  |  |         logging.info( | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |             click.style( | 
					
						
							|  |  |  |                 "Document removed from index: {} latency: {}".format(document.id, end_at - start_at), fg="green" | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     except Exception: | 
					
						
							|  |  |  |         logging.exception("remove document from index failed") | 
					
						
							|  |  |  |         if not document.archived: | 
					
						
							|  |  |  |             document.enabled = True | 
					
						
							|  |  |  |             db.session.commit() | 
					
						
							|  |  |  |     finally: | 
					
						
							|  |  |  |         redis_client.delete(indexing_cache_key) |