import logging
import time

import click
from celery import shared_task

from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import Document
from extensions.ext_database import db
from models.dataset import Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument


@shared_task(queue="dataset")
def deal_dataset_vector_index_task(dataset_id: str, action: str):
    """
    Async task that adds, updates or removes a dataset's vector index.
    :param dataset_id: dataset id
    :param action: one of "add", "update" or "remove"
    Usage: deal_dataset_vector_index_task.delay(dataset_id, action)
    """
    logging.info(click.style("Start deal dataset vector index: {}".format(dataset_id), fg="green"))
    start_at = time.perf_counter()

    try:
        dataset = Dataset.query.filter_by(id=dataset_id).first()

        if not dataset:
            raise Exception("Dataset not found")
        index_type = dataset.doc_form
        index_processor = IndexProcessorFactory(index_type).init_index_processor()
        if action == "remove":
            index_processor.clean(dataset, None, with_keywords=False)
        elif action == "add":
            dataset_documents = (
                db.session.query(DatasetDocument)
                .filter(
                    DatasetDocument.dataset_id == dataset_id,
                    DatasetDocument.indexing_status == "completed",
                    DatasetDocument.enabled == True,
                    DatasetDocument.archived == False,
                )
                .all()
            )

            if dataset_documents:
                dataset_documents_ids = [doc.id for doc in dataset_documents]
                db.session.query(DatasetDocument).filter(DatasetDocument.id.in_(dataset_documents_ids)).update(
                    {"indexing_status": "indexing"}, synchronize_session=False
                )
                db.session.commit()

                for dataset_document in dataset_documents:
                    try:
                        # add from vector index
                        segments = (
                            db.session.query(DocumentSegment)
                            .filter(DocumentSegment.document_id == dataset_document.id, DocumentSegment.enabled == True)
                            .order_by(DocumentSegment.position.asc())
                            .all()
                        )
                        if segments:
                            documents = []
                            for segment in segments:
                                document = Document(
                                    page_content=segment.content,
                                    metadata={
                                        "doc_id": segment.index_node_id,
                                        "doc_hash": segment.index_node_hash,
                                        "document_id": segment.document_id,
                                        "dataset_id": segment.dataset_id,
                                    },
                                )

                                documents.append(document)
                            # save vector index
                            index_processor.load(dataset, documents, with_keywords=False)
                        db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
                            {"indexing_status": "completed"}, synchronize_session=False
                        )
                        db.session.commit()
                    except Exception as e:
                        db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
                            {"indexing_status": "error", "error": str(e)}, synchronize_session=False
                        )
                        db.session.commit()
        elif action == "update":
            dataset_documents = (
                db.session.query(DatasetDocument)
                .filter(
                    DatasetDocument.dataset_id == dataset_id,
                    DatasetDocument.indexing_status == "completed",
                    DatasetDocument.enabled == True,
                    DatasetDocument.archived == False,
                )
                .all()
            )
            # add new index
            if dataset_documents:
                # update document status
                dataset_documents_ids = [doc.id for doc in dataset_documents]
                db.session.query(DatasetDocument).filter(DatasetDocument.id.in_(dataset_documents_ids)).update(
                    {"indexing_status": "indexing"}, synchronize_session=False
                )
                db.session.commit()

                # clean index
                index_processor.clean(dataset, None, with_keywords=False)

                for dataset_document in dataset_documents:
                    # update from vector index
                    try:
                        segments = (
                            db.session.query(DocumentSegment)
                            .filter(DocumentSegment.document_id == dataset_document.id, DocumentSegment.enabled == True)
                            .order_by(DocumentSegment.position.asc())
                            .all()
                        )
                        if segments:
                            documents = []
                            for segment in segments:
                                document = Document(
                                    page_content=segment.content,
                                    metadata={
                                        "doc_id": segment.index_node_id,
                                        "doc_hash": segment.index_node_hash,
                                        "document_id": segment.document_id,
                                        "dataset_id": segment.dataset_id,
                                    },
                                )

                                documents.append(document)
                            # save vector index
                            index_processor.load(dataset, documents, with_keywords=False)
                        db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
                            {"indexing_status": "completed"}, synchronize_session=False
                        )
                        db.session.commit()
                    except Exception as e:
                        db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document.id).update(
                            {"indexing_status": "error", "error": str(e)}, synchronize_session=False
                        )
                        db.session.commit()

        end_at = time.perf_counter()
        logging.info(
            click.style("Deal dataset vector index: {} latency: {}".format(dataset_id, end_at - start_at), fg="green")
        )
    except Exception:
        logging.exception("Deal dataset vector index failed")
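

# --- Usage sketch (illustrative only, not part of the original module) ---
# A minimal example of how this task might be dispatched from application code,
# following the Usage note in the docstring. It assumes a configured Celery
# worker consuming the "dataset" queue and a loaded `dataset` model instance;
# the import path below reflects this module's location and the calling context
# (e.g. a dataset service reacting to an indexing-technique change) is hypothetical.
#
#     from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task
#
#     # rebuild the vector index after the dataset's embedding settings change
#     deal_dataset_vector_index_task.delay(dataset.id, "update")
#
#     # or drop the vector index when vector indexing is disabled for the dataset
#     deal_dataset_vector_index_task.delay(dataset.id, "remove")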