| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | import json | 
					
						
							|  |  |  | import pickle | 
					
						
							|  |  |  | from json import JSONDecodeError | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from sqlalchemy import func | 
					
						
							| 
									
										
										
										
											2023-11-17 22:13:37 +08:00
										 |  |  | from sqlalchemy.dialects.postgresql import UUID, JSONB | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | from extensions.ext_database import db | 
					
						
							|  |  |  | from models.account import Account | 
					
						
							|  |  |  | from models.model import App, UploadFile | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-12 00:57:00 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | class Dataset(db.Model): | 
					
						
							|  |  |  |     __tablename__ = 'datasets' | 
					
						
							|  |  |  |     __table_args__ = ( | 
					
						
							|  |  |  |         db.PrimaryKeyConstraint('id', name='dataset_pkey'), | 
					
						
							|  |  |  |         db.Index('dataset_tenant_idx', 'tenant_id'), | 
					
						
							| 
									
										
										
										
											2023-11-17 22:13:37 +08:00
										 |  |  |         db.Index('retrieval_model_idx', "retrieval_model", postgresql_using='gin') | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     INDEXING_TECHNIQUE_LIST = ['high_quality', 'economy'] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     id = db.Column(UUID, server_default=db.text('uuid_generate_v4()')) | 
					
						
							|  |  |  |     tenant_id = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     name = db.Column(db.String(255), nullable=False) | 
					
						
							|  |  |  |     description = db.Column(db.Text, nullable=True) | 
					
						
							|  |  |  |     provider = db.Column(db.String(255), nullable=False, | 
					
						
							|  |  |  |                          server_default=db.text("'vendor'::character varying")) | 
					
						
							|  |  |  |     permission = db.Column(db.String(255), nullable=False, | 
					
						
							|  |  |  |                            server_default=db.text("'only_me'::character varying")) | 
					
						
							|  |  |  |     data_source_type = db.Column(db.String(255)) | 
					
						
							|  |  |  |     indexing_technique = db.Column(db.String(255), nullable=True) | 
					
						
							|  |  |  |     index_struct = db.Column(db.Text, nullable=True) | 
					
						
							|  |  |  |     created_by = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     created_at = db.Column(db.DateTime, nullable=False, | 
					
						
							|  |  |  |                            server_default=db.text('CURRENT_TIMESTAMP(0)')) | 
					
						
							|  |  |  |     updated_by = db.Column(UUID, nullable=True) | 
					
						
							|  |  |  |     updated_at = db.Column(db.DateTime, nullable=False, | 
					
						
							|  |  |  |                            server_default=db.text('CURRENT_TIMESTAMP(0)')) | 
					
						
							| 
									
										
										
										
											2023-08-29 03:37:45 +08:00
										 |  |  |     embedding_model = db.Column(db.String(255), nullable=True) | 
					
						
							|  |  |  |     embedding_model_provider = db.Column(db.String(255), nullable=True) | 
					
						
							| 
									
										
										
										
											2023-09-18 18:15:41 +08:00
										 |  |  |     collection_binding_id = db.Column(UUID, nullable=True) | 
					
						
							| 
									
										
										
										
											2023-11-17 22:13:37 +08:00
										 |  |  |     retrieval_model = db.Column(JSONB, nullable=True) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def dataset_keyword_table(self): | 
					
						
							|  |  |  |         dataset_keyword_table = db.session.query(DatasetKeywordTable).filter( | 
					
						
							|  |  |  |             DatasetKeywordTable.dataset_id == self.id).first() | 
					
						
							|  |  |  |         if dataset_keyword_table: | 
					
						
							|  |  |  |             return dataset_keyword_table | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def index_struct_dict(self): | 
					
						
							|  |  |  |         return json.loads(self.index_struct) if self.index_struct else None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def created_by_account(self): | 
					
						
							|  |  |  |         return Account.query.get(self.created_by) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def latest_process_rule(self): | 
					
						
							|  |  |  |         return DatasetProcessRule.query.filter(DatasetProcessRule.dataset_id == self.id) \ | 
					
						
							|  |  |  |             .order_by(DatasetProcessRule.created_at.desc()).first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def app_count(self): | 
					
						
							|  |  |  |         return db.session.query(func.count(AppDatasetJoin.id)).filter(AppDatasetJoin.dataset_id == self.id).scalar() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def document_count(self): | 
					
						
							|  |  |  |         return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |     @property | 
					
						
							|  |  |  |     def available_document_count(self): | 
					
						
							|  |  |  |         return db.session.query(func.count(Document.id)).filter( | 
					
						
							|  |  |  |             Document.dataset_id == self.id, | 
					
						
							|  |  |  |             Document.indexing_status == 'completed', | 
					
						
							|  |  |  |             Document.enabled == True, | 
					
						
							|  |  |  |             Document.archived == False | 
					
						
							|  |  |  |         ).scalar() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def available_segment_count(self): | 
					
						
							|  |  |  |         return db.session.query(func.count(DocumentSegment.id)).filter( | 
					
						
							|  |  |  |             DocumentSegment.dataset_id == self.id, | 
					
						
							|  |  |  |             DocumentSegment.status == 'completed', | 
					
						
							|  |  |  |             DocumentSegment.enabled == True | 
					
						
							|  |  |  |         ).scalar() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     @property | 
					
						
							|  |  |  |     def word_count(self): | 
					
						
							|  |  |  |         return Document.query.with_entities(func.coalesce(func.sum(Document.word_count))) \ | 
					
						
							|  |  |  |             .filter(Document.dataset_id == self.id).scalar() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-17 22:13:37 +08:00
										 |  |  |     @property | 
					
						
							|  |  |  |     def retrieval_model_dict(self): | 
					
						
							|  |  |  |         default_retrieval_model = { | 
					
						
							|  |  |  |             'search_method': 'semantic_search', | 
					
						
							|  |  |  |             'reranking_enable': False, | 
					
						
							|  |  |  |             'reranking_model': { | 
					
						
							|  |  |  |                 'reranking_provider_name': '', | 
					
						
							|  |  |  |                 'reranking_model_name': '' | 
					
						
							|  |  |  |             }, | 
					
						
							|  |  |  |             'top_k': 2, | 
					
						
							| 
									
										
										
										
											2023-11-27 15:34:45 +08:00
										 |  |  |             'score_threshold_enabled': False | 
					
						
							| 
									
										
										
										
											2023-11-17 22:13:37 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |         return self.retrieval_model if self.retrieval_model else default_retrieval_model | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | class DatasetProcessRule(db.Model): | 
					
						
							|  |  |  |     __tablename__ = 'dataset_process_rules' | 
					
						
							|  |  |  |     __table_args__ = ( | 
					
						
							|  |  |  |         db.PrimaryKeyConstraint('id', name='dataset_process_rule_pkey'), | 
					
						
							|  |  |  |         db.Index('dataset_process_rule_dataset_id_idx', 'dataset_id'), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     id = db.Column(UUID, nullable=False, | 
					
						
							|  |  |  |                    server_default=db.text('uuid_generate_v4()')) | 
					
						
							|  |  |  |     dataset_id = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     mode = db.Column(db.String(255), nullable=False, | 
					
						
							|  |  |  |                      server_default=db.text("'automatic'::character varying")) | 
					
						
							|  |  |  |     rules = db.Column(db.Text, nullable=True) | 
					
						
							|  |  |  |     created_by = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     created_at = db.Column(db.DateTime, nullable=False, | 
					
						
							|  |  |  |                            server_default=db.text('CURRENT_TIMESTAMP(0)')) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     MODES = ['automatic', 'custom'] | 
					
						
							|  |  |  |     PRE_PROCESSING_RULES = ['remove_stopwords', 'remove_extra_spaces', 'remove_urls_emails'] | 
					
						
							|  |  |  |     AUTOMATIC_RULES = { | 
					
						
							|  |  |  |         'pre_processing_rules': [ | 
					
						
							|  |  |  |             {'id': 'remove_extra_spaces', 'enabled': True}, | 
					
						
							|  |  |  |             {'id': 'remove_urls_emails', 'enabled': False} | 
					
						
							|  |  |  |         ], | 
					
						
							|  |  |  |         'segmentation': { | 
					
						
							|  |  |  |             'delimiter': '\n', | 
					
						
							| 
									
										
										
										
											2023-12-18 23:24:06 +08:00
										 |  |  |             'max_tokens': 1000 | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def to_dict(self): | 
					
						
							|  |  |  |         return { | 
					
						
							|  |  |  |             'id': self.id, | 
					
						
							|  |  |  |             'dataset_id': self.dataset_id, | 
					
						
							|  |  |  |             'mode': self.mode, | 
					
						
							|  |  |  |             'rules': self.rules_dict, | 
					
						
							|  |  |  |             'created_by': self.created_by, | 
					
						
							|  |  |  |             'created_at': self.created_at, | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def rules_dict(self): | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             return json.loads(self.rules) if self.rules else None | 
					
						
							|  |  |  |         except JSONDecodeError: | 
					
						
							|  |  |  |             return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class Document(db.Model): | 
					
						
							|  |  |  |     __tablename__ = 'documents' | 
					
						
							|  |  |  |     __table_args__ = ( | 
					
						
							|  |  |  |         db.PrimaryKeyConstraint('id', name='document_pkey'), | 
					
						
							|  |  |  |         db.Index('document_dataset_id_idx', 'dataset_id'), | 
					
						
							|  |  |  |         db.Index('document_is_paused_idx', 'is_paused'), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # initial fields | 
					
						
							|  |  |  |     id = db.Column(UUID, nullable=False, | 
					
						
							|  |  |  |                    server_default=db.text('uuid_generate_v4()')) | 
					
						
							|  |  |  |     tenant_id = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     dataset_id = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     position = db.Column(db.Integer, nullable=False) | 
					
						
							|  |  |  |     data_source_type = db.Column(db.String(255), nullable=False) | 
					
						
							|  |  |  |     data_source_info = db.Column(db.Text, nullable=True) | 
					
						
							|  |  |  |     dataset_process_rule_id = db.Column(UUID, nullable=True) | 
					
						
							|  |  |  |     batch = db.Column(db.String(255), nullable=False) | 
					
						
							|  |  |  |     name = db.Column(db.String(255), nullable=False) | 
					
						
							|  |  |  |     created_from = db.Column(db.String(255), nullable=False) | 
					
						
							|  |  |  |     created_by = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     created_api_request_id = db.Column(UUID, nullable=True) | 
					
						
							|  |  |  |     created_at = db.Column(db.DateTime, nullable=False, | 
					
						
							|  |  |  |                            server_default=db.text('CURRENT_TIMESTAMP(0)')) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # start processing | 
					
						
							|  |  |  |     processing_started_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # parsing | 
					
						
							|  |  |  |     file_id = db.Column(db.Text, nullable=True) | 
					
						
							|  |  |  |     word_count = db.Column(db.Integer, nullable=True) | 
					
						
							|  |  |  |     parsing_completed_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # cleaning | 
					
						
							|  |  |  |     cleaning_completed_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # split | 
					
						
							|  |  |  |     splitting_completed_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # indexing | 
					
						
							|  |  |  |     tokens = db.Column(db.Integer, nullable=True) | 
					
						
							|  |  |  |     indexing_latency = db.Column(db.Float, nullable=True) | 
					
						
							|  |  |  |     completed_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # pause | 
					
						
							|  |  |  |     is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text('false')) | 
					
						
							|  |  |  |     paused_by = db.Column(UUID, nullable=True) | 
					
						
							|  |  |  |     paused_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # error | 
					
						
							|  |  |  |     error = db.Column(db.Text, nullable=True) | 
					
						
							|  |  |  |     stopped_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # basic fields | 
					
						
							|  |  |  |     indexing_status = db.Column(db.String( | 
					
						
							|  |  |  |         255), nullable=False, server_default=db.text("'waiting'::character varying")) | 
					
						
							|  |  |  |     enabled = db.Column(db.Boolean, nullable=False, | 
					
						
							|  |  |  |                         server_default=db.text('true')) | 
					
						
							|  |  |  |     disabled_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  |     disabled_by = db.Column(UUID, nullable=True) | 
					
						
							|  |  |  |     archived = db.Column(db.Boolean, nullable=False, | 
					
						
							|  |  |  |                          server_default=db.text('false')) | 
					
						
							|  |  |  |     archived_reason = db.Column(db.String(255), nullable=True) | 
					
						
							|  |  |  |     archived_by = db.Column(UUID, nullable=True) | 
					
						
							|  |  |  |     archived_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  |     updated_at = db.Column(db.DateTime, nullable=False, | 
					
						
							|  |  |  |                            server_default=db.text('CURRENT_TIMESTAMP(0)')) | 
					
						
							|  |  |  |     doc_type = db.Column(db.String(40), nullable=True) | 
					
						
							|  |  |  |     doc_metadata = db.Column(db.JSON, nullable=True) | 
					
						
							| 
									
										
										
										
											2023-07-28 20:47:15 +08:00
										 |  |  |     doc_form = db.Column(db.String( | 
					
						
							|  |  |  |         255), nullable=False, server_default=db.text("'text_model'::character varying")) | 
					
						
							| 
									
										
										
										
											2023-08-18 17:37:31 +08:00
										 |  |  |     doc_language = db.Column(db.String(255), nullable=True) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-16 21:47:51 +08:00
										 |  |  |     DATA_SOURCES = ['upload_file', 'notion_import'] | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def display_status(self): | 
					
						
							|  |  |  |         status = None | 
					
						
							|  |  |  |         if self.indexing_status == 'waiting': | 
					
						
							|  |  |  |             status = 'queuing' | 
					
						
							|  |  |  |         elif self.indexing_status not in ['completed', 'error', 'waiting'] and self.is_paused: | 
					
						
							|  |  |  |             status = 'paused' | 
					
						
							|  |  |  |         elif self.indexing_status in ['parsing', 'cleaning', 'splitting', 'indexing']: | 
					
						
							|  |  |  |             status = 'indexing' | 
					
						
							|  |  |  |         elif self.indexing_status == 'error': | 
					
						
							|  |  |  |             status = 'error' | 
					
						
							|  |  |  |         elif self.indexing_status == 'completed' and not self.archived and self.enabled: | 
					
						
							|  |  |  |             status = 'available' | 
					
						
							|  |  |  |         elif self.indexing_status == 'completed' and not self.archived and not self.enabled: | 
					
						
							|  |  |  |             status = 'disabled' | 
					
						
							|  |  |  |         elif self.indexing_status == 'completed' and self.archived: | 
					
						
							|  |  |  |             status = 'archived' | 
					
						
							|  |  |  |         return status | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def data_source_info_dict(self): | 
					
						
							|  |  |  |         if self.data_source_info: | 
					
						
							|  |  |  |             try: | 
					
						
							|  |  |  |                 data_source_info_dict = json.loads(self.data_source_info) | 
					
						
							|  |  |  |             except JSONDecodeError: | 
					
						
							|  |  |  |                 data_source_info_dict = {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             return data_source_info_dict | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def data_source_detail_dict(self): | 
					
						
							|  |  |  |         if self.data_source_info: | 
					
						
							|  |  |  |             if self.data_source_type == 'upload_file': | 
					
						
							|  |  |  |                 data_source_info_dict = json.loads(self.data_source_info) | 
					
						
							|  |  |  |                 file_detail = db.session.query(UploadFile). \ | 
					
						
							|  |  |  |                     filter(UploadFile.id == data_source_info_dict['upload_file_id']). \ | 
					
						
							|  |  |  |                     one_or_none() | 
					
						
							|  |  |  |                 if file_detail: | 
					
						
							|  |  |  |                     return { | 
					
						
							|  |  |  |                         'upload_file': { | 
					
						
							|  |  |  |                             'id': file_detail.id, | 
					
						
							|  |  |  |                             'name': file_detail.name, | 
					
						
							|  |  |  |                             'size': file_detail.size, | 
					
						
							|  |  |  |                             'extension': file_detail.extension, | 
					
						
							|  |  |  |                             'mime_type': file_detail.mime_type, | 
					
						
							|  |  |  |                             'created_by': file_detail.created_by, | 
					
						
							|  |  |  |                             'created_at': file_detail.created_at.timestamp() | 
					
						
							|  |  |  |                         } | 
					
						
							|  |  |  |                     } | 
					
						
							| 
									
										
										
										
											2023-06-16 21:47:51 +08:00
										 |  |  |             elif self.data_source_type == 'notion_import': | 
					
						
							|  |  |  |                 return json.loads(self.data_source_info) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         return {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def average_segment_length(self): | 
					
						
							|  |  |  |         if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0: | 
					
						
							| 
									
										
										
										
											2023-08-12 00:57:00 +08:00
										 |  |  |             return self.word_count // self.segment_count | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         return 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def dataset_process_rule(self): | 
					
						
							|  |  |  |         if self.dataset_process_rule_id: | 
					
						
							|  |  |  |             return DatasetProcessRule.query.get(self.dataset_process_rule_id) | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def dataset(self): | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |         return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none() | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def segment_count(self): | 
					
						
							|  |  |  |         return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def hit_count(self): | 
					
						
							|  |  |  |         return DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count))) \ | 
					
						
							|  |  |  |             .filter(DocumentSegment.document_id == self.id).scalar() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class DocumentSegment(db.Model): | 
					
						
							|  |  |  |     __tablename__ = 'document_segments' | 
					
						
							|  |  |  |     __table_args__ = ( | 
					
						
							|  |  |  |         db.PrimaryKeyConstraint('id', name='document_segment_pkey'), | 
					
						
							|  |  |  |         db.Index('document_segment_dataset_id_idx', 'dataset_id'), | 
					
						
							|  |  |  |         db.Index('document_segment_document_id_idx', 'document_id'), | 
					
						
							|  |  |  |         db.Index('document_segment_tenant_dataset_idx', 'dataset_id', 'tenant_id'), | 
					
						
							|  |  |  |         db.Index('document_segment_tenant_document_idx', 'document_id', 'tenant_id'), | 
					
						
							|  |  |  |         db.Index('document_segment_dataset_node_idx', 'dataset_id', 'index_node_id'), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # initial fields | 
					
						
							|  |  |  |     id = db.Column(UUID, nullable=False, | 
					
						
							|  |  |  |                    server_default=db.text('uuid_generate_v4()')) | 
					
						
							|  |  |  |     tenant_id = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     dataset_id = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     document_id = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     position = db.Column(db.Integer, nullable=False) | 
					
						
							|  |  |  |     content = db.Column(db.Text, nullable=False) | 
					
						
							| 
									
										
										
										
											2023-07-28 20:47:15 +08:00
										 |  |  |     answer = db.Column(db.Text, nullable=True) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     word_count = db.Column(db.Integer, nullable=False) | 
					
						
							|  |  |  |     tokens = db.Column(db.Integer, nullable=False) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # indexing fields | 
					
						
							|  |  |  |     keywords = db.Column(db.JSON, nullable=True) | 
					
						
							|  |  |  |     index_node_id = db.Column(db.String(255), nullable=True) | 
					
						
							|  |  |  |     index_node_hash = db.Column(db.String(255), nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # basic fields | 
					
						
							|  |  |  |     hit_count = db.Column(db.Integer, nullable=False, default=0) | 
					
						
							|  |  |  |     enabled = db.Column(db.Boolean, nullable=False, | 
					
						
							|  |  |  |                         server_default=db.text('true')) | 
					
						
							|  |  |  |     disabled_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  |     disabled_by = db.Column(UUID, nullable=True) | 
					
						
							|  |  |  |     status = db.Column(db.String(255), nullable=False, | 
					
						
							|  |  |  |                        server_default=db.text("'waiting'::character varying")) | 
					
						
							|  |  |  |     created_by = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     created_at = db.Column(db.DateTime, nullable=False, | 
					
						
							|  |  |  |                            server_default=db.text('CURRENT_TIMESTAMP(0)')) | 
					
						
							| 
									
										
										
										
											2023-07-28 20:47:15 +08:00
										 |  |  |     updated_by = db.Column(UUID, nullable=True) | 
					
						
							|  |  |  |     updated_at = db.Column(db.DateTime, nullable=False, | 
					
						
							|  |  |  |                            server_default=db.text('CURRENT_TIMESTAMP(0)')) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     indexing_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  |     completed_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  |     error = db.Column(db.Text, nullable=True) | 
					
						
							|  |  |  |     stopped_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def dataset(self): | 
					
						
							|  |  |  |         return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def document(self): | 
					
						
							|  |  |  |         return db.session.query(Document).filter(Document.id == self.document_id).first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def previous_segment(self): | 
					
						
							|  |  |  |         return db.session.query(DocumentSegment).filter( | 
					
						
							|  |  |  |             DocumentSegment.document_id == self.document_id, | 
					
						
							|  |  |  |             DocumentSegment.position == self.position - 1 | 
					
						
							|  |  |  |         ).first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def next_segment(self): | 
					
						
							|  |  |  |         return db.session.query(DocumentSegment).filter( | 
					
						
							|  |  |  |             DocumentSegment.document_id == self.document_id, | 
					
						
							|  |  |  |             DocumentSegment.position == self.position + 1 | 
					
						
							|  |  |  |         ).first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class AppDatasetJoin(db.Model): | 
					
						
							|  |  |  |     __tablename__ = 'app_dataset_joins' | 
					
						
							|  |  |  |     __table_args__ = ( | 
					
						
							|  |  |  |         db.PrimaryKeyConstraint('id', name='app_dataset_join_pkey'), | 
					
						
							|  |  |  |         db.Index('app_dataset_join_app_dataset_idx', 'dataset_id', 'app_id'), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     id = db.Column(UUID, primary_key=True, nullable=False, server_default=db.text('uuid_generate_v4()')) | 
					
						
							|  |  |  |     app_id = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     dataset_id = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def app(self): | 
					
						
							|  |  |  |         return App.query.get(self.app_id) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class DatasetQuery(db.Model): | 
					
						
							|  |  |  |     __tablename__ = 'dataset_queries' | 
					
						
							|  |  |  |     __table_args__ = ( | 
					
						
							|  |  |  |         db.PrimaryKeyConstraint('id', name='dataset_query_pkey'), | 
					
						
							|  |  |  |         db.Index('dataset_query_dataset_id_idx', 'dataset_id'), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     id = db.Column(UUID, primary_key=True, nullable=False, server_default=db.text('uuid_generate_v4()')) | 
					
						
							|  |  |  |     dataset_id = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     content = db.Column(db.Text, nullable=False) | 
					
						
							|  |  |  |     source = db.Column(db.String(255), nullable=False) | 
					
						
							|  |  |  |     source_app_id = db.Column(UUID, nullable=True) | 
					
						
							|  |  |  |     created_by_role = db.Column(db.String, nullable=False) | 
					
						
							|  |  |  |     created_by = db.Column(UUID, nullable=False) | 
					
						
							|  |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class DatasetKeywordTable(db.Model): | 
					
						
							|  |  |  |     __tablename__ = 'dataset_keyword_tables' | 
					
						
							|  |  |  |     __table_args__ = ( | 
					
						
							|  |  |  |         db.PrimaryKeyConstraint('id', name='dataset_keyword_table_pkey'), | 
					
						
							|  |  |  |         db.Index('dataset_keyword_table_dataset_id_idx', 'dataset_id'), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()')) | 
					
						
							|  |  |  |     dataset_id = db.Column(UUID, nullable=False, unique=True) | 
					
						
							|  |  |  |     keyword_table = db.Column(db.Text, nullable=False) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def keyword_table_dict(self): | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |         class SetDecoder(json.JSONDecoder): | 
					
						
							|  |  |  |             def __init__(self, *args, **kwargs): | 
					
						
							|  |  |  |                 super().__init__(object_hook=self.object_hook, *args, **kwargs) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             def object_hook(self, dct): | 
					
						
							|  |  |  |                 if isinstance(dct, dict): | 
					
						
							|  |  |  |                     for keyword, node_idxs in dct.items(): | 
					
						
							|  |  |  |                         if isinstance(node_idxs, list): | 
					
						
							|  |  |  |                             dct[keyword] = set(node_idxs) | 
					
						
							|  |  |  |                 return dct | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class Embedding(db.Model): | 
					
						
							|  |  |  |     __tablename__ = 'embeddings' | 
					
						
							|  |  |  |     __table_args__ = ( | 
					
						
							|  |  |  |         db.PrimaryKeyConstraint('id', name='embedding_pkey'), | 
					
						
							| 
									
										
										
										
											2023-08-12 00:57:00 +08:00
										 |  |  |         db.UniqueConstraint('model_name', 'hash', name='embedding_hash_idx') | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()')) | 
					
						
							| 
									
										
										
										
											2023-08-12 00:57:00 +08:00
										 |  |  |     model_name = db.Column(db.String(40), nullable=False, | 
					
						
							|  |  |  |                            server_default=db.text("'text-embedding-ada-002'::character varying")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     hash = db.Column(db.String(64), nullable=False) | 
					
						
							|  |  |  |     embedding = db.Column(db.LargeBinary, nullable=False) | 
					
						
							|  |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)')) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def set_embedding(self, embedding_data: list[float]): | 
					
						
							|  |  |  |         self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def get_embedding(self) -> list[float]: | 
					
						
							| 
									
										
										
										
											2023-08-12 00:57:00 +08:00
										 |  |  |         return pickle.loads(self.embedding) | 
					
						
							| 
									
										
										
										
											2023-09-18 18:15:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class DatasetCollectionBinding(db.Model): | 
					
						
							|  |  |  |     __tablename__ = 'dataset_collection_bindings' | 
					
						
							|  |  |  |     __table_args__ = ( | 
					
						
							|  |  |  |         db.PrimaryKeyConstraint('id', name='dataset_collection_bindings_pkey'), | 
					
						
							|  |  |  |         db.Index('provider_model_name_idx', 'provider_name', 'model_name') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()')) | 
					
						
							|  |  |  |     provider_name = db.Column(db.String(40), nullable=False) | 
					
						
							|  |  |  |     model_name = db.Column(db.String(40), nullable=False) | 
					
						
							| 
									
										
										
										
											2023-12-18 13:10:05 +08:00
										 |  |  |     type = db.Column(db.String(40), server_default=db.text("'dataset'::character varying"), nullable=False) | 
					
						
							| 
									
										
										
										
											2023-09-18 18:15:41 +08:00
										 |  |  |     collection_name = db.Column(db.String(64), nullable=False) | 
					
						
							|  |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)')) |