| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  | import logging | 
					
						
							|  |  |  | import os | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | from collections.abc import Generator | 
					
						
							|  |  |  | from pathlib import Path | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-24 18:38:51 +08:00
										 |  |  | import opendal  # type: ignore[import] | 
					
						
							| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  | from dotenv import dotenv_values | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | from extensions.storage.base_storage import BaseStorage | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  | logger = logging.getLogger(__name__) | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  | def _get_opendal_kwargs(*, scheme: str, env_file_path: str = ".env", prefix: str = "OPENDAL_"): | 
					
						
							|  |  |  |     kwargs = {} | 
					
						
							|  |  |  |     config_prefix = prefix + scheme.upper() + "_" | 
					
						
							|  |  |  |     for key, value in os.environ.items(): | 
					
						
							|  |  |  |         if key.startswith(config_prefix): | 
					
						
							|  |  |  |             kwargs[key[len(config_prefix) :].lower()] = value | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-24 18:38:51 +08:00
										 |  |  |     file_env_vars: dict = dotenv_values(env_file_path) or {} | 
					
						
							| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  |     for key, value in file_env_vars.items(): | 
					
						
							|  |  |  |         if key.startswith(config_prefix) and key[len(config_prefix) :].lower() not in kwargs and value: | 
					
						
							|  |  |  |             kwargs[key[len(config_prefix) :].lower()] = value | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return kwargs | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class OpenDALStorage(BaseStorage): | 
					
						
							| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  |     def __init__(self, scheme: str, **kwargs): | 
					
						
							|  |  |  |         kwargs = kwargs or _get_opendal_kwargs(scheme=scheme) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if scheme == "fs": | 
					
						
							|  |  |  |             root = kwargs.get("root", "storage") | 
					
						
							|  |  |  |             Path(root).mkdir(parents=True, exist_ok=True) | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-17 17:05:13 +08:00
										 |  |  |         self.op = opendal.Operator(scheme=scheme, **kwargs)  # type: ignore | 
					
						
							| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  |         logger.debug(f"opendal operator created with scheme {scheme}") | 
					
						
							|  |  |  |         retry_layer = opendal.layers.RetryLayer(max_times=3, factor=2.0, jitter=True) | 
					
						
							|  |  |  |         self.op = self.op.layer(retry_layer) | 
					
						
							|  |  |  |         logger.debug("added retry layer to opendal operator") | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def save(self, filename: str, data: bytes) -> None: | 
					
						
							|  |  |  |         self.op.write(path=filename, bs=data) | 
					
						
							| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  |         logger.debug(f"file {filename} saved") | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def load_once(self, filename: str) -> bytes: | 
					
						
							|  |  |  |         if not self.exists(filename): | 
					
						
							|  |  |  |             raise FileNotFoundError("File not found") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-24 18:38:51 +08:00
										 |  |  |         content: bytes = self.op.read(path=filename) | 
					
						
							| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  |         logger.debug(f"file {filename} loaded") | 
					
						
							|  |  |  |         return content | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def load_stream(self, filename: str) -> Generator: | 
					
						
							|  |  |  |         if not self.exists(filename): | 
					
						
							|  |  |  |             raise FileNotFoundError("File not found") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         batch_size = 4096 | 
					
						
							|  |  |  |         file = self.op.open(path=filename, mode="rb") | 
					
						
							|  |  |  |         while chunk := file.read(batch_size): | 
					
						
							|  |  |  |             yield chunk | 
					
						
							| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  |         logger.debug(f"file {filename} loaded as stream") | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def download(self, filename: str, target_filepath: str): | 
					
						
							|  |  |  |         if not self.exists(filename): | 
					
						
							|  |  |  |             raise FileNotFoundError("File not found") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         with Path(target_filepath).open("wb") as f: | 
					
						
							|  |  |  |             f.write(self.op.read(path=filename)) | 
					
						
							| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  |         logger.debug(f"file {filename} downloaded to {target_filepath}") | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 09:33:30 +08:00
										 |  |  |     def exists(self, filename: str) -> bool: | 
					
						
							| 
									
										
										
										
											2025-03-11 20:44:09 +08:00
										 |  |  |         res: bool = self.op.exists(path=filename) | 
					
						
							|  |  |  |         return res | 
					
						
							| 
									
										
										
										
											2024-12-11 14:50:54 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def delete(self, filename: str): | 
					
						
							|  |  |  |         if self.exists(filename): | 
					
						
							|  |  |  |             self.op.delete(path=filename) | 
					
						
							| 
									
										
										
										
											2024-12-18 09:05:54 +08:00
										 |  |  |             logger.debug(f"file {filename} deleted") | 
					
						
							|  |  |  |             return | 
					
						
							|  |  |  |         logger.debug(f"file {filename} not found, skip delete") | 
					
						
							| 
									
										
										
										
											2025-04-27 12:11:04 +09:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def scan(self, path: str, files: bool = True, directories: bool = False) -> list[str]: | 
					
						
							|  |  |  |         if not self.exists(path): | 
					
						
							|  |  |  |             raise FileNotFoundError("Path not found") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         all_files = self.op.scan(path=path) | 
					
						
							|  |  |  |         if files and directories: | 
					
						
							|  |  |  |             logger.debug(f"files and directories on {path} scanned") | 
					
						
							|  |  |  |             return [f.path for f in all_files] | 
					
						
							|  |  |  |         if files: | 
					
						
							|  |  |  |             logger.debug(f"files on {path} scanned") | 
					
						
							|  |  |  |             return [f.path for f in all_files if not f.path.endswith("/")] | 
					
						
							|  |  |  |         elif directories: | 
					
						
							|  |  |  |             logger.debug(f"directories on {path} scanned") | 
					
						
							|  |  |  |             return [f.path for f in all_files if f.path.endswith("/")] | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise ValueError("At least one of files or directories must be True") |