import logging
import time

import click
from celery import shared_task  # type: ignore

from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from models.dataset import Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument


@shared_task(queue="dataset")
def deal_dataset_vector_index_task(dataset_id: str, action: str):
    """
    Asynchronously add, update or remove a dataset's vector index.
    :param dataset_id: dataset id
    :param action: "add", "update" or "remove"
    Usage: deal_dataset_vector_index_task.delay(dataset_id, action)
    """
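    # any action other than "add", "update" or "remove" falls through to the timing log below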
    logging.info(click.style("Start deal dataset vector index: {}".format(dataset_id), fg="green"))
    start_at = time.perf_counter()

    try:
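        # look up the dataset; it may have been deleted before the task ran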
        dataset = Dataset.query.filter_by(id=dataset_id).first()

        if not dataset:
            raise Exception("Dataset not found")
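        # default to paragraph indexing when the dataset has no stored doc form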
        index_type = dataset.doc_form or IndexType.PARAGRAPH_INDEX
        index_processor = IndexProcessorFactory(index_type).init_index_processor()
        if action == "remove":
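            # passing None instead of node ids clears the vectors for the whole dataset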
            index_processor.clean(dataset, None, with_keywords=False)
										 |  |  |         elif action == "add": | 
					
						
							| 
									
										
										
										
											2024-08-26 13:38:37 +08:00
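            # only documents that finished indexing, are enabled, and are not archived get indexed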
            dataset_documents = (
                db.session.query(DatasetDocument)
                .filter(
                    DatasetDocument.dataset_id == dataset_id,
                    DatasetDocument.indexing_status == "completed",
                    DatasetDocument.enabled == True,
                    DatasetDocument.archived == False,
                )
                .all()
            )

            if dataset_documents:
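                # mark the documents as "indexing" while their vectors are rebuilt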
                dataset_documents_ids = [doc.id for doc in dataset_documents]
                db.session.query(DatasetDocument).filter(DatasetDocument.id.in_(dataset_documents_ids)).update(
                    {"indexing_status": "indexing"}, synchronize_session=False
                )
                db.session.commit()

                for dataset_document in dataset_documents:
                    try:
                        # add segments to the vector index
                        segments = (
                            db.session.query(DocumentSegment)
                            .filter(DocumentSegment.document_id == dataset_document.id, DocumentSegment.enabled == True)
                            .order_by(DocumentSegment.position.asc())
                            .all()
                        )
                        if segments:
                            documents = []
                            for segment in segments:
                                document = Document(
                                    page_content=segment.content,
                                    metadata={
                                        "doc_id": segment.index_node_id,
                                        "doc_hash": segment.index_node_hash,
                                        "document_id": segment.document_id,
                                        "dataset_id": segment.dataset_id,
                                    },
                                )

                                documents.append(document)
                            # save vector index
                            index_processor.load(dataset, documents, with_keywords=False)
                        db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
                            {"indexing_status": "completed"}, synchronize_session=False
                        )
                        db.session.commit()
                    except Exception as e:
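                        # record the failure on this document and continue with the rest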
                        db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
                            {"indexing_status": "error", "error": str(e)}, synchronize_session=False
                        )
                        db.session.commit()
										 |  |  |         elif action == "update": | 
					
						
							|  |  |  |             dataset_documents = ( | 
					
						
							|  |  |  |                 db.session.query(DatasetDocument) | 
					
						
							|  |  |  |                 .filter( | 
					
						
							|  |  |  |                     DatasetDocument.dataset_id == dataset_id, | 
					
						
							|  |  |  |                     DatasetDocument.indexing_status == "completed", | 
					
						
							|  |  |  |                     DatasetDocument.enabled == True, | 
					
						
							|  |  |  |                     DatasetDocument.archived == False, | 
					
						
							|  |  |  |                 ) | 
					
						
							|  |  |  |                 .all() | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-04-24 15:02:29 +08:00
            # add new index
            if dataset_documents:
                # update document status
                dataset_documents_ids = [doc.id for doc in dataset_documents]
                db.session.query(DatasetDocument).filter(DatasetDocument.id.in_(dataset_documents_ids)).update(
                    {"indexing_status": "indexing"}, synchronize_session=False
                )
                db.session.commit()

                # clean the existing index; child-chunk records are kept and reused below
                index_processor.clean(dataset, None, with_keywords=False, delete_child_chunks=False)

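                # rebuild each document's vectors from its stored segments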
                for dataset_document in dataset_documents:
                    # update segments in the vector index
                    try:
                        segments = (
                            db.session.query(DocumentSegment)
                            .filter(DocumentSegment.document_id == dataset_document.id, DocumentSegment.enabled == True)
                            .order_by(DocumentSegment.position.asc())
                            .all()
                        )
                        if segments:
                            documents = []
                            for segment in segments:
                                document = Document(
                                    page_content=segment.content,
                                    metadata={
                                        "doc_id": segment.index_node_id,
                                        "doc_hash": segment.index_node_hash,
                                        "document_id": segment.document_id,
                                        "dataset_id": segment.dataset_id,
                                    },
                                )
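                                # parent-child datasets also re-embed each segment's child chunks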
                                if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
                                    child_chunks = segment.get_child_chunks()
                                    if child_chunks:
                                        child_documents = []
                                        for child_chunk in child_chunks:
                                            child_document = ChildDocument(
                                                page_content=child_chunk.content,
                                                metadata={
                                                    "doc_id": child_chunk.index_node_id,
                                                    "doc_hash": child_chunk.index_node_hash,
                                                    "document_id": segment.document_id,
                                                    "dataset_id": segment.dataset_id,
                                                },
                                            )
                                            child_documents.append(child_document)
                                        document.children = child_documents
                                documents.append(document)
                            # save vector index
                            index_processor.load(dataset, documents, with_keywords=False)
                        db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
                            {"indexing_status": "completed"}, synchronize_session=False
                        )
                        db.session.commit()
                    except Exception as e:
                        db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
                            {"indexing_status": "error", "error": str(e)}, synchronize_session=False
                        )
                        db.session.commit()
            else:
                # clean collection
                index_processor.clean(dataset, None, with_keywords=False, delete_child_chunks=False)

        end_at = time.perf_counter()
        logging.info(
            click.style("Deal dataset vector index: {} latency: {}".format(dataset_id, end_at - start_at), fg="green")
        )
    except Exception:
        logging.exception("Deal dataset vector index failed")
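    # always release the DB session back to the connection pool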
    finally:
        db.session.close()