| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | import datetime | 
					
						
							|  |  |  | import logging | 
					
						
							|  |  |  | import time | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import click | 
					
						
							| 
									
										
										
										
											2024-12-24 18:38:51 +08:00
										 |  |  | from celery import shared_task  # type: ignore | 
					
						
							| 
									
										
										
										
											2024-02-06 13:21:13 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-25 19:49:07 +08:00
										 |  |  | from core.rag.index_processor.constant.index_type import IndexType | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  | from core.rag.index_processor.index_processor_factory import IndexProcessorFactory | 
					
						
							| 
									
										
										
										
											2024-12-25 19:49:07 +08:00
										 |  |  | from core.rag.models.document import ChildDocument, Document | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | from extensions.ext_database import db | 
					
						
							|  |  |  | from extensions.ext_redis import redis_client | 
					
						
							|  |  |  | from models.dataset import DocumentSegment | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  | @shared_task(queue="dataset") | 
					
						
							| 
									
										
										
										
											2023-07-28 20:47:15 +08:00
										 |  |  | def enable_segment_to_index_task(segment_id: str): | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2023-07-28 20:47:15 +08:00
										 |  |  |     Async enable segment to index | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     :param segment_id: | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-28 20:47:15 +08:00
										 |  |  |     Usage: enable_segment_to_index_task.delay(segment_id) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |     logging.info(click.style("Start enable segment to index: {}".format(segment_id), fg="green")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     start_at = time.perf_counter() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first() | 
					
						
							|  |  |  |     if not segment: | 
					
						
							| 
									
										
										
										
											2025-04-07 20:31:26 +08:00
										 |  |  |         logging.info(click.style("Segment not found: {}".format(segment_id), fg="red")) | 
					
						
							|  |  |  |         db.session.close() | 
					
						
							|  |  |  |         return | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |     if segment.status != "completed": | 
					
						
							| 
									
										
										
										
											2025-04-07 20:31:26 +08:00
										 |  |  |         logging.info(click.style("Segment is not completed, enable is not allowed: {}".format(segment_id), fg="red")) | 
					
						
							|  |  |  |         db.session.close() | 
					
						
							|  |  |  |         return | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |     indexing_cache_key = "segment_{}_indexing".format(segment.id) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |         document = Document( | 
					
						
							|  |  |  |             page_content=segment.content, | 
					
						
							|  |  |  |             metadata={ | 
					
						
							|  |  |  |                 "doc_id": segment.index_node_id, | 
					
						
							|  |  |  |                 "doc_hash": segment.index_node_hash, | 
					
						
							|  |  |  |                 "document_id": segment.document_id, | 
					
						
							|  |  |  |                 "dataset_id": segment.dataset_id, | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |             }, | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         dataset = segment.dataset | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if not dataset: | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |             logging.info(click.style("Segment {} has no dataset, pass.".format(segment.id), fg="cyan")) | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |             return | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |         dataset_document = segment.document | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if not dataset_document: | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |             logging.info(click.style("Segment {} has no document, pass.".format(segment.id), fg="cyan")) | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |             return | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |         if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed": | 
					
						
							|  |  |  |             logging.info(click.style("Segment {} document status is invalid, pass.".format(segment.id), fg="cyan")) | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |             return | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  |         index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor() | 
					
						
							| 
									
										
										
										
											2024-12-25 19:49:07 +08:00
										 |  |  |         if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX: | 
					
						
							| 
									
										
										
										
											2025-03-25 16:26:14 +08:00
										 |  |  |             child_chunks = segment.get_child_chunks() | 
					
						
							| 
									
										
										
										
											2024-12-25 19:49:07 +08:00
										 |  |  |             if child_chunks: | 
					
						
							|  |  |  |                 child_documents = [] | 
					
						
							|  |  |  |                 for child_chunk in child_chunks: | 
					
						
							|  |  |  |                     child_document = ChildDocument( | 
					
						
							|  |  |  |                         page_content=child_chunk.content, | 
					
						
							|  |  |  |                         metadata={ | 
					
						
							|  |  |  |                             "doc_id": child_chunk.index_node_id, | 
					
						
							|  |  |  |                             "doc_hash": child_chunk.index_node_hash, | 
					
						
							|  |  |  |                             "document_id": segment.document_id, | 
					
						
							|  |  |  |                             "dataset_id": segment.dataset_id, | 
					
						
							|  |  |  |                         }, | 
					
						
							|  |  |  |                     ) | 
					
						
							|  |  |  |                     child_documents.append(child_document) | 
					
						
							|  |  |  |                 document.children = child_documents | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         # save vector index | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  |         index_processor.load(dataset, [document]) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         end_at = time.perf_counter() | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |         logging.info( | 
					
						
							|  |  |  |             click.style("Segment enabled to index: {} latency: {}".format(segment.id, end_at - start_at), fg="green") | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     except Exception as e: | 
					
						
							| 
									
										
										
										
											2023-07-28 20:47:15 +08:00
										 |  |  |         logging.exception("enable segment to index failed") | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         segment.enabled = False | 
					
						
							| 
									
										
										
										
											2024-11-24 13:28:46 +08:00
										 |  |  |         segment.disabled_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
										 |  |  |         segment.status = "error" | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         segment.error = str(e) | 
					
						
							|  |  |  |         db.session.commit() | 
					
						
							|  |  |  |     finally: | 
					
						
							|  |  |  |         redis_client.delete(indexing_cache_key) | 
					
						
							| 
									
										
										
										
											2025-04-07 20:31:26 +08:00
										 |  |  |         db.session.close() |