| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  | import datetime | 
					
						
							|  |  |  | import hashlib | 
					
						
							|  |  |  | import uuid | 
					
						
							| 
									
										
										
										
											2024-02-09 15:21:33 +08:00
										 |  |  | from collections.abc import Generator | 
					
						
							|  |  |  | from typing import Union | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-06 13:21:13 +08:00
										 |  |  | from flask_login import current_user | 
					
						
							|  |  |  | from werkzeug.datastructures import FileStorage | 
					
						
							|  |  |  | from werkzeug.exceptions import NotFound | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-12 12:25:38 +08:00
										 |  |  | from configs import dify_config | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  | from core.file.upload_file_parser import UploadFileParser | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  | from core.rag.extractor.extract_processor import ExtractProcessor | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  | from extensions.ext_database import db | 
					
						
							| 
									
										
										
										
											2024-01-12 12:34:01 +08:00
										 |  |  | from extensions.ext_storage import storage | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  | from models.account import Account | 
					
						
							| 
									
										
										
										
											2024-01-12 12:34:01 +08:00
										 |  |  | from models.model import EndUser, UploadFile | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  | from services.errors.file import FileTooLargeError, UnsupportedFileTypeError | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-18 16:25:37 +08:00
										 |  |  | IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg'] | 
					
						
							| 
									
										
										
										
											2024-01-21 16:58:06 +08:00
										 |  |  | IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS]) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-12 14:53:44 +08:00
										 |  |  | ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls', 'docx', 'csv'] | 
					
						
							| 
									
										
										
										
											2024-06-23 00:01:02 +09:00
										 |  |  | UNSTRUCTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls', | 
					
						
							| 
									
										
										
										
											2024-04-12 11:25:02 +08:00
										 |  |  |                                    'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub'] | 
					
						
							| 
									
										
										
										
											2024-04-12 14:53:44 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  | PREVIEW_WORDS_LIMIT = 3000 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class FileService: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  |     def upload_file(file: FileStorage, user: Union[Account, EndUser], only_image: bool = False) -> UploadFile: | 
					
						
							| 
									
										
										
										
											2024-06-17 20:36:54 +08:00
										 |  |  |         filename = file.filename | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  |         extension = file.filename.split('.')[-1] | 
					
						
							| 
									
										
										
										
											2024-06-17 20:36:54 +08:00
										 |  |  |         if len(filename) > 200: | 
					
						
							|  |  |  |             filename = filename.split('.')[0][:200] + '.' + extension | 
					
						
							| 
									
										
										
										
											2024-07-12 12:25:38 +08:00
										 |  |  |         etl_type = dify_config.ETL_TYPE | 
					
						
							| 
									
										
										
										
											2024-06-23 00:01:02 +09:00
										 |  |  |         allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS + IMAGE_EXTENSIONS if etl_type == 'Unstructured' \ | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  |             else ALLOWED_EXTENSIONS + IMAGE_EXTENSIONS | 
					
						
							| 
									
										
										
										
											2023-12-18 23:24:06 +08:00
										 |  |  |         if extension.lower() not in allowed_extensions: | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  |             raise UnsupportedFileTypeError() | 
					
						
							|  |  |  |         elif only_image and extension.lower() not in IMAGE_EXTENSIONS: | 
					
						
							|  |  |  |             raise UnsupportedFileTypeError() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |         # read file content | 
					
						
							|  |  |  |         file_content = file.read() | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |         # get file size | 
					
						
							|  |  |  |         file_size = len(file_content) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  |         if extension.lower() in IMAGE_EXTENSIONS: | 
					
						
							| 
									
										
										
										
											2024-07-12 12:25:38 +08:00
										 |  |  |             file_size_limit = dify_config.UPLOAD_IMAGE_FILE_SIZE_LIMIT * 1024 * 1024 | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2024-07-12 12:25:38 +08:00
										 |  |  |             file_size_limit = dify_config.UPLOAD_FILE_SIZE_LIMIT * 1024 * 1024 | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |         if file_size > file_size_limit: | 
					
						
							|  |  |  |             message = f'File size exceeded. {file_size} > {file_size_limit}' | 
					
						
							|  |  |  |             raise FileTooLargeError(message) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # user uuid as file name | 
					
						
							|  |  |  |         file_uuid = str(uuid.uuid4()) | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         if isinstance(user, Account): | 
					
						
							|  |  |  |             current_tenant_id = user.current_tenant_id | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             # end_user | 
					
						
							|  |  |  |             current_tenant_id = user.tenant_id | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         file_key = 'upload_files/' + current_tenant_id + '/' + file_uuid + '.' + extension | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # save file to storage | 
					
						
							|  |  |  |         storage.save(file_key, file_content) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # save file to db | 
					
						
							|  |  |  |         upload_file = UploadFile( | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  |             tenant_id=current_tenant_id, | 
					
						
							| 
									
										
										
										
											2024-07-12 12:25:38 +08:00
										 |  |  |             storage_type=dify_config.STORAGE_TYPE, | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |             key=file_key, | 
					
						
							| 
									
										
										
										
											2024-06-17 20:36:54 +08:00
										 |  |  |             name=filename, | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |             size=file_size, | 
					
						
							|  |  |  |             extension=extension, | 
					
						
							|  |  |  |             mime_type=file.mimetype, | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  |             created_by_role=('account' if isinstance(user, Account) else 'end_user'), | 
					
						
							|  |  |  |             created_by=user.id, | 
					
						
							| 
									
										
										
										
											2024-04-12 16:22:24 +08:00
										 |  |  |             created_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None), | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |             used=False, | 
					
						
							|  |  |  |             hash=hashlib.sha3_256(file_content).hexdigest() | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         db.session.add(upload_file) | 
					
						
							|  |  |  |         db.session.commit() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return upload_file | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def upload_text(text: str, text_name: str) -> UploadFile: | 
					
						
							| 
									
										
										
										
											2024-06-17 20:36:54 +08:00
										 |  |  |         if len(text_name) > 200: | 
					
						
							|  |  |  |             text_name = text_name[:200] | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |         # user uuid as file name | 
					
						
							|  |  |  |         file_uuid = str(uuid.uuid4()) | 
					
						
							|  |  |  |         file_key = 'upload_files/' + current_user.current_tenant_id + '/' + file_uuid + '.txt' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # save file to storage | 
					
						
							|  |  |  |         storage.save(file_key, text.encode('utf-8')) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # save file to db | 
					
						
							|  |  |  |         upload_file = UploadFile( | 
					
						
							|  |  |  |             tenant_id=current_user.current_tenant_id, | 
					
						
							| 
									
										
										
										
											2024-07-12 12:25:38 +08:00
										 |  |  |             storage_type=dify_config.STORAGE_TYPE, | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |             key=file_key, | 
					
						
							|  |  |  |             name=text_name + '.txt', | 
					
						
							|  |  |  |             size=len(text), | 
					
						
							|  |  |  |             extension='txt', | 
					
						
							|  |  |  |             mime_type='text/plain', | 
					
						
							|  |  |  |             created_by=current_user.id, | 
					
						
							| 
									
										
										
										
											2024-04-12 16:22:24 +08:00
										 |  |  |             created_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None), | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |             used=True, | 
					
						
							|  |  |  |             used_by=current_user.id, | 
					
						
							| 
									
										
										
										
											2024-04-12 16:22:24 +08:00
										 |  |  |             used_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         db.session.add(upload_file) | 
					
						
							|  |  |  |         db.session.commit() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return upload_file | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def get_file_preview(file_id: str) -> str: | 
					
						
							|  |  |  |         upload_file = db.session.query(UploadFile) \ | 
					
						
							|  |  |  |             .filter(UploadFile.id == file_id) \ | 
					
						
							|  |  |  |             .first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if not upload_file: | 
					
						
							|  |  |  |             raise NotFound("File not found") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # extract text from file | 
					
						
							|  |  |  |         extension = upload_file.extension | 
					
						
							| 
									
										
										
										
											2024-07-12 12:25:38 +08:00
										 |  |  |         etl_type = dify_config.ETL_TYPE | 
					
						
							| 
									
										
										
										
											2024-06-23 00:01:02 +09:00
										 |  |  |         allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS if etl_type == 'Unstructured' else ALLOWED_EXTENSIONS | 
					
						
							| 
									
										
										
										
											2023-12-26 15:06:44 +08:00
										 |  |  |         if extension.lower() not in allowed_extensions: | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |             raise UnsupportedFileTypeError() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  |         text = ExtractProcessor.load_from_upload_file(upload_file, return_text=True) | 
					
						
							| 
									
										
										
										
											2023-09-27 16:06:32 +08:00
										 |  |  |         text = text[0:PREVIEW_WORDS_LIMIT] if text else '' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return text | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							| 
									
										
										
										
											2024-02-09 15:21:33 +08:00
										 |  |  |     def get_image_preview(file_id: str, timestamp: str, nonce: str, sign: str) -> tuple[Generator, str]: | 
					
						
							| 
									
										
										
										
											2023-11-13 22:05:46 +08:00
										 |  |  |         result = UploadFileParser.verify_image_file_signature(file_id, timestamp, nonce, sign) | 
					
						
							|  |  |  |         if not result: | 
					
						
							|  |  |  |             raise NotFound("File not found or signature is invalid") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         upload_file = db.session.query(UploadFile) \ | 
					
						
							|  |  |  |             .filter(UploadFile.id == file_id) \ | 
					
						
							|  |  |  |             .first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if not upload_file: | 
					
						
							|  |  |  |             raise NotFound("File not found or signature is invalid") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # extract text from file | 
					
						
							|  |  |  |         extension = upload_file.extension | 
					
						
							|  |  |  |         if extension.lower() not in IMAGE_EXTENSIONS: | 
					
						
							|  |  |  |             raise UnsupportedFileTypeError() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         generator = storage.load(upload_file.key, stream=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return generator, upload_file.mime_type | 
					
						
							| 
									
										
										
										
											2024-02-20 16:05:09 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-18 16:25:37 +08:00
										 |  |  |     @staticmethod | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  |     def get_public_image_preview(file_id: str) -> tuple[Generator, str]: | 
					
						
							| 
									
										
										
										
											2023-12-18 16:25:37 +08:00
										 |  |  |         upload_file = db.session.query(UploadFile) \ | 
					
						
							|  |  |  |             .filter(UploadFile.id == file_id) \ | 
					
						
							|  |  |  |             .first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if not upload_file: | 
					
						
							|  |  |  |             raise NotFound("File not found or signature is invalid") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # extract text from file | 
					
						
							|  |  |  |         extension = upload_file.extension | 
					
						
							|  |  |  |         if extension.lower() not in IMAGE_EXTENSIONS: | 
					
						
							|  |  |  |             raise UnsupportedFileTypeError() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         generator = storage.load(upload_file.key) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return generator, upload_file.mime_type |