| 
									
										
										
										
											2024-05-23 18:05:23 +08:00
										 |  |  | import base64 | 
					
						
							| 
									
										
										
										
											2024-08-21 20:25:45 +08:00
										 |  |  | import enum | 
					
						
							| 
									
										
										
										
											2024-05-23 18:05:23 +08:00
										 |  |  | import hashlib | 
					
						
							|  |  |  | import hmac | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | import json | 
					
						
							| 
									
										
										
										
											2024-04-01 20:19:30 +08:00
										 |  |  | import logging | 
					
						
							| 
									
										
										
										
											2024-05-23 18:05:23 +08:00
										 |  |  | import os | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | import pickle | 
					
						
							| 
									
										
										
										
											2024-05-23 18:05:23 +08:00
										 |  |  | import re | 
					
						
							|  |  |  | import time | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | from json import JSONDecodeError | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-06 13:21:13 +08:00
										 |  |  | from sqlalchemy import func | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  | from sqlalchemy.dialects.postgresql import JSONB | 
					
						
							| 
									
										
										
										
											2024-02-06 13:21:13 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-17 22:26:18 +08:00
										 |  |  | from configs import dify_config | 
					
						
							| 
									
										
										
										
											2024-09-08 12:14:11 +07:00
										 |  |  | from core.rag.retrieval.retrieval_methods import RetrievalMethod | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | from extensions.ext_database import db | 
					
						
							| 
									
										
										
										
											2024-04-01 20:19:30 +08:00
										 |  |  | from extensions.ext_storage import storage | 
					
						
							| 
									
										
										
										
											2024-08-13 14:44:10 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | from .account import Account | 
					
						
							|  |  |  | from .model import App, Tag, TagBinding, UploadFile | 
					
						
							|  |  |  | from .types import StringUUID | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-12 00:57:00 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-21 20:25:45 +08:00
										 |  |  | class DatasetPermissionEnum(str, enum.Enum): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     ONLY_ME = "only_me" | 
					
						
							|  |  |  |     ALL_TEAM = "all_team_members" | 
					
						
							|  |  |  |     PARTIAL_TEAM = "partial_members" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-21 20:25:45 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | class Dataset(db.Model): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     __tablename__ = "datasets" | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     __table_args__ = ( | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         db.PrimaryKeyConstraint("id", name="dataset_pkey"), | 
					
						
							|  |  |  |         db.Index("dataset_tenant_idx", "tenant_id"), | 
					
						
							|  |  |  |         db.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"), | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None] | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()")) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     tenant_id = db.Column(StringUUID, nullable=False) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     name = db.Column(db.String(255), nullable=False) | 
					
						
							|  |  |  |     description = db.Column(db.Text, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     provider = db.Column(db.String(255), nullable=False, server_default=db.text("'vendor'::character varying")) | 
					
						
							|  |  |  |     permission = db.Column(db.String(255), nullable=False, server_default=db.text("'only_me'::character varying")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     data_source_type = db.Column(db.String(255)) | 
					
						
							|  |  |  |     indexing_technique = db.Column(db.String(255), nullable=True) | 
					
						
							|  |  |  |     index_struct = db.Column(db.Text, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     created_by = db.Column(StringUUID, nullable=False) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     updated_by = db.Column(StringUUID, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")) | 
					
						
							| 
									
										
										
										
											2023-08-29 03:37:45 +08:00
										 |  |  |     embedding_model = db.Column(db.String(255), nullable=True) | 
					
						
							|  |  |  |     embedding_model_provider = db.Column(db.String(255), nullable=True) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     collection_binding_id = db.Column(StringUUID, nullable=True) | 
					
						
							| 
									
										
										
										
											2023-11-17 22:13:37 +08:00
										 |  |  |     retrieval_model = db.Column(JSONB, nullable=True) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def dataset_keyword_table(self): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         dataset_keyword_table = ( | 
					
						
							|  |  |  |             db.session.query(DatasetKeywordTable).filter(DatasetKeywordTable.dataset_id == self.id).first() | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         if dataset_keyword_table: | 
					
						
							|  |  |  |             return dataset_keyword_table | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def index_struct_dict(self): | 
					
						
							|  |  |  |         return json.loads(self.index_struct) if self.index_struct else None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def created_by_account(self): | 
					
						
							| 
									
										
										
										
											2024-07-17 13:54:35 +08:00
										 |  |  |         return db.session.get(Account, self.created_by) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def latest_process_rule(self): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         return ( | 
					
						
							|  |  |  |             DatasetProcessRule.query.filter(DatasetProcessRule.dataset_id == self.id) | 
					
						
							|  |  |  |             .order_by(DatasetProcessRule.created_at.desc()) | 
					
						
							|  |  |  |             .first() | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def app_count(self): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         return ( | 
					
						
							|  |  |  |             db.session.query(func.count(AppDatasetJoin.id)) | 
					
						
							|  |  |  |             .filter(AppDatasetJoin.dataset_id == self.id, App.id == AppDatasetJoin.app_id) | 
					
						
							|  |  |  |             .scalar() | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def document_count(self): | 
					
						
							|  |  |  |         return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |     @property | 
					
						
							|  |  |  |     def available_document_count(self): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         return ( | 
					
						
							|  |  |  |             db.session.query(func.count(Document.id)) | 
					
						
							|  |  |  |             .filter( | 
					
						
							|  |  |  |                 Document.dataset_id == self.id, | 
					
						
							|  |  |  |                 Document.indexing_status == "completed", | 
					
						
							|  |  |  |                 Document.enabled == True, | 
					
						
							|  |  |  |                 Document.archived == False, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |             .scalar() | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def available_segment_count(self): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         return ( | 
					
						
							|  |  |  |             db.session.query(func.count(DocumentSegment.id)) | 
					
						
							|  |  |  |             .filter( | 
					
						
							|  |  |  |                 DocumentSegment.dataset_id == self.id, | 
					
						
							|  |  |  |                 DocumentSegment.status == "completed", | 
					
						
							|  |  |  |                 DocumentSegment.enabled == True, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |             .scalar() | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     @property | 
					
						
							|  |  |  |     def word_count(self): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         return ( | 
					
						
							|  |  |  |             Document.query.with_entities(func.coalesce(func.sum(Document.word_count))) | 
					
						
							|  |  |  |             .filter(Document.dataset_id == self.id) | 
					
						
							|  |  |  |             .scalar() | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  |     @property | 
					
						
							|  |  |  |     def doc_form(self): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         document = db.session.query(Document).filter(Document.dataset_id == self.id).first() | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  |         if document: | 
					
						
							|  |  |  |             return document.doc_form | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-17 22:13:37 +08:00
										 |  |  |     @property | 
					
						
							|  |  |  |     def retrieval_model_dict(self): | 
					
						
							|  |  |  |         default_retrieval_model = { | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |             "search_method": RetrievalMethod.SEMANTIC_SEARCH.value, | 
					
						
							|  |  |  |             "reranking_enable": False, | 
					
						
							|  |  |  |             "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""}, | 
					
						
							|  |  |  |             "top_k": 2, | 
					
						
							|  |  |  |             "score_threshold_enabled": False, | 
					
						
							| 
									
										
										
										
											2023-11-17 22:13:37 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |         return self.retrieval_model if self.retrieval_model else default_retrieval_model | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-24 15:02:29 +08:00
										 |  |  |     @property | 
					
						
							|  |  |  |     def tags(self): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         tags = ( | 
					
						
							|  |  |  |             db.session.query(Tag) | 
					
						
							|  |  |  |             .join(TagBinding, Tag.id == TagBinding.tag_id) | 
					
						
							|  |  |  |             .filter( | 
					
						
							|  |  |  |                 TagBinding.target_id == self.id, | 
					
						
							|  |  |  |                 TagBinding.tenant_id == self.tenant_id, | 
					
						
							|  |  |  |                 Tag.tenant_id == self.tenant_id, | 
					
						
							|  |  |  |                 Tag.type == "knowledge", | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |             .all() | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-04-24 15:02:29 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         return tags if tags else [] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-29 12:47:10 +08:00
										 |  |  |     @staticmethod | 
					
						
							|  |  |  |     def gen_collection_name_by_id(dataset_id: str) -> str: | 
					
						
							|  |  |  |         normalized_dataset_id = dataset_id.replace("-", "_") | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         return f"Vector_index_{normalized_dataset_id}_Node" | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-02 20:46:24 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | class DatasetProcessRule(db.Model): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     __tablename__ = "dataset_process_rules" | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     __table_args__ = ( | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         db.PrimaryKeyConstraint("id", name="dataset_process_rule_pkey"), | 
					
						
							|  |  |  |         db.Index("dataset_process_rule_dataset_id_idx", "dataset_id"), | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()")) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     dataset_id = db.Column(StringUUID, nullable=False) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     mode = db.Column(db.String(255), nullable=False, server_default=db.text("'automatic'::character varying")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     rules = db.Column(db.Text, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     created_by = db.Column(StringUUID, nullable=False) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     MODES = ["automatic", "custom"] | 
					
						
							|  |  |  |     PRE_PROCESSING_RULES = ["remove_stopwords", "remove_extra_spaces", "remove_urls_emails"] | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     AUTOMATIC_RULES = { | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         "pre_processing_rules": [ | 
					
						
							|  |  |  |             {"id": "remove_extra_spaces", "enabled": True}, | 
					
						
							|  |  |  |             {"id": "remove_urls_emails", "enabled": False}, | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         ], | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50}, | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def to_dict(self): | 
					
						
							|  |  |  |         return { | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |             "id": self.id, | 
					
						
							|  |  |  |             "dataset_id": self.dataset_id, | 
					
						
							|  |  |  |             "mode": self.mode, | 
					
						
							|  |  |  |             "rules": self.rules_dict, | 
					
						
							|  |  |  |             "created_by": self.created_by, | 
					
						
							|  |  |  |             "created_at": self.created_at, | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def rules_dict(self): | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             return json.loads(self.rules) if self.rules else None | 
					
						
							|  |  |  |         except JSONDecodeError: | 
					
						
							|  |  |  |             return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class Document(db.Model): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     __tablename__ = "documents" | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     __table_args__ = ( | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         db.PrimaryKeyConstraint("id", name="document_pkey"), | 
					
						
							|  |  |  |         db.Index("document_dataset_id_idx", "dataset_id"), | 
					
						
							|  |  |  |         db.Index("document_is_paused_idx", "is_paused"), | 
					
						
							|  |  |  |         db.Index("document_tenant_idx", "tenant_id"), | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # initial fields | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()")) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     tenant_id = db.Column(StringUUID, nullable=False) | 
					
						
							|  |  |  |     dataset_id = db.Column(StringUUID, nullable=False) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     position = db.Column(db.Integer, nullable=False) | 
					
						
							|  |  |  |     data_source_type = db.Column(db.String(255), nullable=False) | 
					
						
							|  |  |  |     data_source_info = db.Column(db.Text, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     dataset_process_rule_id = db.Column(StringUUID, nullable=True) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     batch = db.Column(db.String(255), nullable=False) | 
					
						
							|  |  |  |     name = db.Column(db.String(255), nullable=False) | 
					
						
							|  |  |  |     created_from = db.Column(db.String(255), nullable=False) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     created_by = db.Column(StringUUID, nullable=False) | 
					
						
							|  |  |  |     created_api_request_id = db.Column(StringUUID, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # start processing | 
					
						
							|  |  |  |     processing_started_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # parsing | 
					
						
							|  |  |  |     file_id = db.Column(db.Text, nullable=True) | 
					
						
							|  |  |  |     word_count = db.Column(db.Integer, nullable=True) | 
					
						
							|  |  |  |     parsing_completed_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # cleaning | 
					
						
							|  |  |  |     cleaning_completed_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # split | 
					
						
							|  |  |  |     splitting_completed_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # indexing | 
					
						
							|  |  |  |     tokens = db.Column(db.Integer, nullable=True) | 
					
						
							|  |  |  |     indexing_latency = db.Column(db.Float, nullable=True) | 
					
						
							|  |  |  |     completed_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # pause | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text("false")) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     paused_by = db.Column(StringUUID, nullable=True) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     paused_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # error | 
					
						
							|  |  |  |     error = db.Column(db.Text, nullable=True) | 
					
						
							|  |  |  |     stopped_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # basic fields | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     indexing_status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying")) | 
					
						
							|  |  |  |     enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     disabled_at = db.Column(db.DateTime, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     disabled_by = db.Column(StringUUID, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     archived = db.Column(db.Boolean, nullable=False, server_default=db.text("false")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     archived_reason = db.Column(db.String(255), nullable=True) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     archived_by = db.Column(StringUUID, nullable=True) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     archived_at = db.Column(db.DateTime, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     doc_type = db.Column(db.String(40), nullable=True) | 
					
						
							|  |  |  |     doc_metadata = db.Column(db.JSON, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     doc_form = db.Column(db.String(255), nullable=False, server_default=db.text("'text_model'::character varying")) | 
					
						
							| 
									
										
										
										
											2023-08-18 17:37:31 +08:00
										 |  |  |     doc_language = db.Column(db.String(255), nullable=True) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"] | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def display_status(self): | 
					
						
							|  |  |  |         status = None | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         if self.indexing_status == "waiting": | 
					
						
							|  |  |  |             status = "queuing" | 
					
						
							|  |  |  |         elif self.indexing_status not in ["completed", "error", "waiting"] and self.is_paused: | 
					
						
							|  |  |  |             status = "paused" | 
					
						
							|  |  |  |         elif self.indexing_status in ["parsing", "cleaning", "splitting", "indexing"]: | 
					
						
							|  |  |  |             status = "indexing" | 
					
						
							|  |  |  |         elif self.indexing_status == "error": | 
					
						
							|  |  |  |             status = "error" | 
					
						
							|  |  |  |         elif self.indexing_status == "completed" and not self.archived and self.enabled: | 
					
						
							|  |  |  |             status = "available" | 
					
						
							|  |  |  |         elif self.indexing_status == "completed" and not self.archived and not self.enabled: | 
					
						
							|  |  |  |             status = "disabled" | 
					
						
							|  |  |  |         elif self.indexing_status == "completed" and self.archived: | 
					
						
							|  |  |  |             status = "archived" | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         return status | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def data_source_info_dict(self): | 
					
						
							|  |  |  |         if self.data_source_info: | 
					
						
							|  |  |  |             try: | 
					
						
							|  |  |  |                 data_source_info_dict = json.loads(self.data_source_info) | 
					
						
							|  |  |  |             except JSONDecodeError: | 
					
						
							|  |  |  |                 data_source_info_dict = {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             return data_source_info_dict | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def data_source_detail_dict(self): | 
					
						
							|  |  |  |         if self.data_source_info: | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |             if self.data_source_type == "upload_file": | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |                 data_source_info_dict = json.loads(self.data_source_info) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |                 file_detail = ( | 
					
						
							|  |  |  |                     db.session.query(UploadFile) | 
					
						
							|  |  |  |                     .filter(UploadFile.id == data_source_info_dict["upload_file_id"]) | 
					
						
							|  |  |  |                     .one_or_none() | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |                 if file_detail: | 
					
						
							|  |  |  |                     return { | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |                         "upload_file": { | 
					
						
							|  |  |  |                             "id": file_detail.id, | 
					
						
							|  |  |  |                             "name": file_detail.name, | 
					
						
							|  |  |  |                             "size": file_detail.size, | 
					
						
							|  |  |  |                             "extension": file_detail.extension, | 
					
						
							|  |  |  |                             "mime_type": file_detail.mime_type, | 
					
						
							|  |  |  |                             "created_by": file_detail.created_by, | 
					
						
							|  |  |  |                             "created_at": file_detail.created_at.timestamp(), | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |                         } | 
					
						
							|  |  |  |                     } | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |             elif self.data_source_type == "notion_import" or self.data_source_type == "website_crawl": | 
					
						
							| 
									
										
										
										
											2023-06-16 21:47:51 +08:00
										 |  |  |                 return json.loads(self.data_source_info) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         return {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def average_segment_length(self): | 
					
						
							|  |  |  |         if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0: | 
					
						
							| 
									
										
										
										
											2023-08-12 00:57:00 +08:00
										 |  |  |             return self.word_count // self.segment_count | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         return 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def dataset_process_rule(self): | 
					
						
							|  |  |  |         if self.dataset_process_rule_id: | 
					
						
							| 
									
										
										
										
											2024-07-17 13:54:35 +08:00
										 |  |  |             return db.session.get(DatasetProcessRule, self.dataset_process_rule_id) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def dataset(self): | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |         return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none() | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def segment_count(self): | 
					
						
							|  |  |  |         return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def hit_count(self): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         return ( | 
					
						
							|  |  |  |             DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count))) | 
					
						
							|  |  |  |             .filter(DocumentSegment.document_id == self.id) | 
					
						
							|  |  |  |             .scalar() | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-28 00:24:37 +08:00
										 |  |  |     def to_dict(self): | 
					
						
							|  |  |  |         return { | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |             "id": self.id, | 
					
						
							|  |  |  |             "tenant_id": self.tenant_id, | 
					
						
							|  |  |  |             "dataset_id": self.dataset_id, | 
					
						
							|  |  |  |             "position": self.position, | 
					
						
							|  |  |  |             "data_source_type": self.data_source_type, | 
					
						
							|  |  |  |             "data_source_info": self.data_source_info, | 
					
						
							|  |  |  |             "dataset_process_rule_id": self.dataset_process_rule_id, | 
					
						
							|  |  |  |             "batch": self.batch, | 
					
						
							|  |  |  |             "name": self.name, | 
					
						
							|  |  |  |             "created_from": self.created_from, | 
					
						
							|  |  |  |             "created_by": self.created_by, | 
					
						
							|  |  |  |             "created_api_request_id": self.created_api_request_id, | 
					
						
							|  |  |  |             "created_at": self.created_at, | 
					
						
							|  |  |  |             "processing_started_at": self.processing_started_at, | 
					
						
							|  |  |  |             "file_id": self.file_id, | 
					
						
							|  |  |  |             "word_count": self.word_count, | 
					
						
							|  |  |  |             "parsing_completed_at": self.parsing_completed_at, | 
					
						
							|  |  |  |             "cleaning_completed_at": self.cleaning_completed_at, | 
					
						
							|  |  |  |             "splitting_completed_at": self.splitting_completed_at, | 
					
						
							|  |  |  |             "tokens": self.tokens, | 
					
						
							|  |  |  |             "indexing_latency": self.indexing_latency, | 
					
						
							|  |  |  |             "completed_at": self.completed_at, | 
					
						
							|  |  |  |             "is_paused": self.is_paused, | 
					
						
							|  |  |  |             "paused_by": self.paused_by, | 
					
						
							|  |  |  |             "paused_at": self.paused_at, | 
					
						
							|  |  |  |             "error": self.error, | 
					
						
							|  |  |  |             "stopped_at": self.stopped_at, | 
					
						
							|  |  |  |             "indexing_status": self.indexing_status, | 
					
						
							|  |  |  |             "enabled": self.enabled, | 
					
						
							|  |  |  |             "disabled_at": self.disabled_at, | 
					
						
							|  |  |  |             "disabled_by": self.disabled_by, | 
					
						
							|  |  |  |             "archived": self.archived, | 
					
						
							|  |  |  |             "archived_reason": self.archived_reason, | 
					
						
							|  |  |  |             "archived_by": self.archived_by, | 
					
						
							|  |  |  |             "archived_at": self.archived_at, | 
					
						
							|  |  |  |             "updated_at": self.updated_at, | 
					
						
							|  |  |  |             "doc_type": self.doc_type, | 
					
						
							|  |  |  |             "doc_metadata": self.doc_metadata, | 
					
						
							|  |  |  |             "doc_form": self.doc_form, | 
					
						
							|  |  |  |             "doc_language": self.doc_language, | 
					
						
							|  |  |  |             "display_status": self.display_status, | 
					
						
							|  |  |  |             "data_source_info_dict": self.data_source_info_dict, | 
					
						
							|  |  |  |             "average_segment_length": self.average_segment_length, | 
					
						
							|  |  |  |             "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None, | 
					
						
							|  |  |  |             "dataset": self.dataset.to_dict() if self.dataset else None, | 
					
						
							|  |  |  |             "segment_count": self.segment_count, | 
					
						
							|  |  |  |             "hit_count": self.hit_count, | 
					
						
							| 
									
										
										
										
											2024-06-28 00:24:37 +08:00
										 |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @classmethod | 
					
						
							|  |  |  |     def from_dict(cls, data: dict): | 
					
						
							|  |  |  |         return cls( | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |             id=data.get("id"), | 
					
						
							|  |  |  |             tenant_id=data.get("tenant_id"), | 
					
						
							|  |  |  |             dataset_id=data.get("dataset_id"), | 
					
						
							|  |  |  |             position=data.get("position"), | 
					
						
							|  |  |  |             data_source_type=data.get("data_source_type"), | 
					
						
							|  |  |  |             data_source_info=data.get("data_source_info"), | 
					
						
							|  |  |  |             dataset_process_rule_id=data.get("dataset_process_rule_id"), | 
					
						
							|  |  |  |             batch=data.get("batch"), | 
					
						
							|  |  |  |             name=data.get("name"), | 
					
						
							|  |  |  |             created_from=data.get("created_from"), | 
					
						
							|  |  |  |             created_by=data.get("created_by"), | 
					
						
							|  |  |  |             created_api_request_id=data.get("created_api_request_id"), | 
					
						
							|  |  |  |             created_at=data.get("created_at"), | 
					
						
							|  |  |  |             processing_started_at=data.get("processing_started_at"), | 
					
						
							|  |  |  |             file_id=data.get("file_id"), | 
					
						
							|  |  |  |             word_count=data.get("word_count"), | 
					
						
							|  |  |  |             parsing_completed_at=data.get("parsing_completed_at"), | 
					
						
							|  |  |  |             cleaning_completed_at=data.get("cleaning_completed_at"), | 
					
						
							|  |  |  |             splitting_completed_at=data.get("splitting_completed_at"), | 
					
						
							|  |  |  |             tokens=data.get("tokens"), | 
					
						
							|  |  |  |             indexing_latency=data.get("indexing_latency"), | 
					
						
							|  |  |  |             completed_at=data.get("completed_at"), | 
					
						
							|  |  |  |             is_paused=data.get("is_paused"), | 
					
						
							|  |  |  |             paused_by=data.get("paused_by"), | 
					
						
							|  |  |  |             paused_at=data.get("paused_at"), | 
					
						
							|  |  |  |             error=data.get("error"), | 
					
						
							|  |  |  |             stopped_at=data.get("stopped_at"), | 
					
						
							|  |  |  |             indexing_status=data.get("indexing_status"), | 
					
						
							|  |  |  |             enabled=data.get("enabled"), | 
					
						
							|  |  |  |             disabled_at=data.get("disabled_at"), | 
					
						
							|  |  |  |             disabled_by=data.get("disabled_by"), | 
					
						
							|  |  |  |             archived=data.get("archived"), | 
					
						
							|  |  |  |             archived_reason=data.get("archived_reason"), | 
					
						
							|  |  |  |             archived_by=data.get("archived_by"), | 
					
						
							|  |  |  |             archived_at=data.get("archived_at"), | 
					
						
							|  |  |  |             updated_at=data.get("updated_at"), | 
					
						
							|  |  |  |             doc_type=data.get("doc_type"), | 
					
						
							|  |  |  |             doc_metadata=data.get("doc_metadata"), | 
					
						
							|  |  |  |             doc_form=data.get("doc_form"), | 
					
						
							|  |  |  |             doc_language=data.get("doc_language"), | 
					
						
							| 
									
										
										
										
											2024-06-28 00:24:37 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | class DocumentSegment(db.Model): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     __tablename__ = "document_segments" | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     __table_args__ = ( | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         db.PrimaryKeyConstraint("id", name="document_segment_pkey"), | 
					
						
							|  |  |  |         db.Index("document_segment_dataset_id_idx", "dataset_id"), | 
					
						
							|  |  |  |         db.Index("document_segment_document_id_idx", "document_id"), | 
					
						
							|  |  |  |         db.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"), | 
					
						
							|  |  |  |         db.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"), | 
					
						
							|  |  |  |         db.Index("document_segment_dataset_node_idx", "dataset_id", "index_node_id"), | 
					
						
							|  |  |  |         db.Index("document_segment_tenant_idx", "tenant_id"), | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # initial fields | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()")) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     tenant_id = db.Column(StringUUID, nullable=False) | 
					
						
							|  |  |  |     dataset_id = db.Column(StringUUID, nullable=False) | 
					
						
							|  |  |  |     document_id = db.Column(StringUUID, nullable=False) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     position = db.Column(db.Integer, nullable=False) | 
					
						
							|  |  |  |     content = db.Column(db.Text, nullable=False) | 
					
						
							| 
									
										
										
										
											2023-07-28 20:47:15 +08:00
										 |  |  |     answer = db.Column(db.Text, nullable=True) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     word_count = db.Column(db.Integer, nullable=False) | 
					
						
							|  |  |  |     tokens = db.Column(db.Integer, nullable=False) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # indexing fields | 
					
						
							|  |  |  |     keywords = db.Column(db.JSON, nullable=True) | 
					
						
							|  |  |  |     index_node_id = db.Column(db.String(255), nullable=True) | 
					
						
							|  |  |  |     index_node_hash = db.Column(db.String(255), nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # basic fields | 
					
						
							|  |  |  |     hit_count = db.Column(db.Integer, nullable=False, default=0) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     disabled_at = db.Column(db.DateTime, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     disabled_by = db.Column(StringUUID, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying")) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     created_by = db.Column(StringUUID, nullable=False) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     updated_by = db.Column(StringUUID, nullable=True) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     indexing_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  |     completed_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  |     error = db.Column(db.Text, nullable=True) | 
					
						
							|  |  |  |     stopped_at = db.Column(db.DateTime, nullable=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def dataset(self): | 
					
						
							|  |  |  |         return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def document(self): | 
					
						
							|  |  |  |         return db.session.query(Document).filter(Document.id == self.document_id).first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def previous_segment(self): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         return ( | 
					
						
							|  |  |  |             db.session.query(DocumentSegment) | 
					
						
							|  |  |  |             .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position - 1) | 
					
						
							|  |  |  |             .first() | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def next_segment(self): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         return ( | 
					
						
							|  |  |  |             db.session.query(DocumentSegment) | 
					
						
							|  |  |  |             .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position + 1) | 
					
						
							|  |  |  |             .first() | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-23 18:05:23 +08:00
										 |  |  |     def get_sign_content(self): | 
					
						
							|  |  |  |         pattern = r"/files/([a-f0-9\-]+)/image-preview" | 
					
						
							|  |  |  |         text = self.content | 
					
						
							| 
									
										
										
										
											2024-06-19 12:36:40 +08:00
										 |  |  |         matches = re.finditer(pattern, text) | 
					
						
							|  |  |  |         signed_urls = [] | 
					
						
							|  |  |  |         for match in matches: | 
					
						
							| 
									
										
										
										
											2024-05-23 18:05:23 +08:00
										 |  |  |             upload_file_id = match.group(1) | 
					
						
							|  |  |  |             nonce = os.urandom(16).hex() | 
					
						
							|  |  |  |             timestamp = str(int(time.time())) | 
					
						
							|  |  |  |             data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}" | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |             secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b"" | 
					
						
							| 
									
										
										
										
											2024-05-23 18:05:23 +08:00
										 |  |  |             sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest() | 
					
						
							|  |  |  |             encoded_sign = base64.urlsafe_b64encode(sign).decode() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}" | 
					
						
							| 
									
										
										
										
											2024-06-19 12:36:40 +08:00
										 |  |  |             signed_url = f"{match.group(0)}?{params}" | 
					
						
							|  |  |  |             signed_urls.append((match.start(), match.end(), signed_url)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Reconstruct the text with signed URLs | 
					
						
							|  |  |  |         offset = 0 | 
					
						
							|  |  |  |         for start, end, signed_url in signed_urls: | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |             text = text[: start + offset] + signed_url + text[end + offset :] | 
					
						
							| 
									
										
										
										
											2024-06-19 12:36:40 +08:00
										 |  |  |             offset += len(signed_url) - (end - start) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-23 18:05:23 +08:00
										 |  |  |         return text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | class AppDatasetJoin(db.Model): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     __tablename__ = "app_dataset_joins" | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     __table_args__ = ( | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         db.PrimaryKeyConstraint("id", name="app_dataset_join_pkey"), | 
					
						
							|  |  |  |         db.Index("app_dataset_join_app_dataset_idx", "dataset_id", "app_id"), | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()")) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     app_id = db.Column(StringUUID, nullable=False) | 
					
						
							|  |  |  |     dataset_id = db.Column(StringUUID, nullable=False) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def app(self): | 
					
						
							| 
									
										
										
										
											2024-07-17 13:54:35 +08:00
										 |  |  |         return db.session.get(App, self.app_id) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class DatasetQuery(db.Model): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     __tablename__ = "dataset_queries" | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     __table_args__ = ( | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         db.PrimaryKeyConstraint("id", name="dataset_query_pkey"), | 
					
						
							|  |  |  |         db.Index("dataset_query_dataset_id_idx", "dataset_id"), | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()")) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     dataset_id = db.Column(StringUUID, nullable=False) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     content = db.Column(db.Text, nullable=False) | 
					
						
							|  |  |  |     source = db.Column(db.String(255), nullable=False) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     source_app_id = db.Column(StringUUID, nullable=True) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     created_by_role = db.Column(db.String, nullable=False) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     created_by = db.Column(StringUUID, nullable=False) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class DatasetKeywordTable(db.Model): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     __tablename__ = "dataset_keyword_tables" | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     __table_args__ = ( | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         db.PrimaryKeyConstraint("id", name="dataset_keyword_table_pkey"), | 
					
						
							|  |  |  |         db.Index("dataset_keyword_table_dataset_id_idx", "dataset_id"), | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()")) | 
					
						
							| 
									
										
										
										
											2024-04-29 11:58:17 +08:00
										 |  |  |     dataset_id = db.Column(StringUUID, nullable=False, unique=True) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     keyword_table = db.Column(db.Text, nullable=False) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     data_source_type = db.Column( | 
					
						
							|  |  |  |         db.String(255), nullable=False, server_default=db.text("'database'::character varying") | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def keyword_table_dict(self): | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |         class SetDecoder(json.JSONDecoder): | 
					
						
							|  |  |  |             def __init__(self, *args, **kwargs): | 
					
						
							|  |  |  |                 super().__init__(object_hook=self.object_hook, *args, **kwargs) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             def object_hook(self, dct): | 
					
						
							|  |  |  |                 if isinstance(dct, dict): | 
					
						
							|  |  |  |                     for keyword, node_idxs in dct.items(): | 
					
						
							|  |  |  |                         if isinstance(node_idxs, list): | 
					
						
							|  |  |  |                             dct[keyword] = set(node_idxs) | 
					
						
							|  |  |  |                 return dct | 
					
						
							| 
									
										
										
										
											2024-04-02 20:46:24 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-01 20:19:30 +08:00
										 |  |  |         # get dataset | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         dataset = Dataset.query.filter_by(id=self.dataset_id).first() | 
					
						
							| 
									
										
										
										
											2024-04-01 20:19:30 +08:00
										 |  |  |         if not dataset: | 
					
						
							|  |  |  |             return None | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         if self.data_source_type == "database": | 
					
						
							| 
									
										
										
										
											2024-04-01 20:19:30 +08:00
										 |  |  |             return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None | 
					
						
							|  |  |  |         else: | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |             file_key = "keyword_files/" + dataset.tenant_id + "/" + self.dataset_id + ".txt" | 
					
						
							| 
									
										
										
										
											2024-04-01 20:19:30 +08:00
										 |  |  |             try: | 
					
						
							|  |  |  |                 keyword_table_text = storage.load_once(file_key) | 
					
						
							|  |  |  |                 if keyword_table_text: | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |                     return json.loads(keyword_table_text.decode("utf-8"), cls=SetDecoder) | 
					
						
							| 
									
										
										
										
											2024-04-01 20:19:30 +08:00
										 |  |  |                 return None | 
					
						
							|  |  |  |             except Exception as e: | 
					
						
							|  |  |  |                 logging.exception(str(e)) | 
					
						
							|  |  |  |                 return None | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class Embedding(db.Model): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     __tablename__ = "embeddings" | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     __table_args__ = ( | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         db.PrimaryKeyConstraint("id", name="embedding_pkey"), | 
					
						
							|  |  |  |         db.UniqueConstraint("model_name", "hash", "provider_name", name="embedding_hash_idx"), | 
					
						
							|  |  |  |         db.Index("created_at_idx", "created_at"), | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()")) | 
					
						
							|  |  |  |     model_name = db.Column( | 
					
						
							|  |  |  |         db.String(255), nullable=False, server_default=db.text("'text-embedding-ada-002'::character varying") | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  |     hash = db.Column(db.String(64), nullable=False) | 
					
						
							|  |  |  |     embedding = db.Column(db.LargeBinary, nullable=False) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")) | 
					
						
							|  |  |  |     provider_name = db.Column(db.String(255), nullable=False, server_default=db.text("''::character varying")) | 
					
						
							| 
									
										
										
										
											2023-05-15 08:51:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def set_embedding(self, embedding_data: list[float]): | 
					
						
							|  |  |  |         self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def get_embedding(self) -> list[float]: | 
					
						
							| 
									
										
										
										
											2023-08-12 00:57:00 +08:00
										 |  |  |         return pickle.loads(self.embedding) | 
					
						
							| 
									
										
										
										
											2023-09-18 18:15:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class DatasetCollectionBinding(db.Model): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     __tablename__ = "dataset_collection_bindings" | 
					
						
							| 
									
										
										
										
											2023-09-18 18:15:41 +08:00
										 |  |  |     __table_args__ = ( | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         db.PrimaryKeyConstraint("id", name="dataset_collection_bindings_pkey"), | 
					
						
							|  |  |  |         db.Index("provider_model_name_idx", "provider_name", "model_name"), | 
					
						
							| 
									
										
										
										
											2023-09-18 18:15:41 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()")) | 
					
						
							| 
									
										
										
										
											2023-09-18 18:15:41 +08:00
										 |  |  |     provider_name = db.Column(db.String(40), nullable=False) | 
					
						
							| 
									
										
										
										
											2024-07-28 10:42:58 +09:00
										 |  |  |     model_name = db.Column(db.String(255), nullable=False) | 
					
						
							| 
									
										
										
										
											2023-12-18 13:10:05 +08:00
										 |  |  |     type = db.Column(db.String(40), server_default=db.text("'dataset'::character varying"), nullable=False) | 
					
						
							| 
									
										
										
										
											2023-09-18 18:15:41 +08:00
										 |  |  |     collection_name = db.Column(db.String(64), nullable=False) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")) | 
					
						
							| 
									
										
										
										
											2024-07-09 17:47:54 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class DatasetPermission(db.Model): | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     __tablename__ = "dataset_permissions" | 
					
						
							| 
									
										
										
										
											2024-07-09 17:47:54 +08:00
										 |  |  |     __table_args__ = ( | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |         db.PrimaryKeyConstraint("id", name="dataset_permission_pkey"), | 
					
						
							|  |  |  |         db.Index("idx_dataset_permissions_dataset_id", "dataset_id"), | 
					
						
							|  |  |  |         db.Index("idx_dataset_permissions_account_id", "account_id"), | 
					
						
							|  |  |  |         db.Index("idx_dataset_permissions_tenant_id", "tenant_id"), | 
					
						
							| 
									
										
										
										
											2024-07-09 17:47:54 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"), primary_key=True) | 
					
						
							| 
									
										
										
										
											2024-07-09 17:47:54 +08:00
										 |  |  |     dataset_id = db.Column(StringUUID, nullable=False) | 
					
						
							|  |  |  |     account_id = db.Column(StringUUID, nullable=False) | 
					
						
							|  |  |  |     tenant_id = db.Column(StringUUID, nullable=False) | 
					
						
							| 
									
										
										
										
											2024-09-10 17:08:06 +08:00
										 |  |  |     has_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true")) | 
					
						
							|  |  |  |     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")) |