2025-02-26 18:11:02 +08:00
|
|
|
import os
|
2025-02-26 05:38:38 +08:00
|
|
|
from multiprocessing.synchronize import Lock as ProcessLock
|
|
|
|
from threading import Lock as ThreadLock
|
|
|
|
from multiprocessing import Manager
|
|
|
|
from typing import Any, Dict, Optional, Union
|
2025-02-26 18:11:02 +08:00
|
|
|
from lightrag.utils import logger
|
2025-02-26 05:38:38 +08:00
|
|
|
|
|
|
|
# Type alias used in the signatures of the lock accessors in this module.
LockType = Union[ProcessLock, ThreadLock]

# Module state. Everything below is None until initialize_share_data() runs.
_manager = None  # multiprocessing Manager instance
_initialized = None  # manager Value flag read by initialize_share_data()
_is_multiprocess = None  # manager Value flag; read through `.value`
is_multiprocess = None  # plain copy of `_is_multiprocess.value`

# shared data for storage across processes
_shared_dicts: Optional[Dict[str, Any]] = None  # namespace -> data dict
_share_objects: Optional[Dict[str, Any]] = None  # namespace -> object
_init_flags: Optional[Dict[str, bool]] = None  # namespace -> initialized

# Single lock returned by all lock accessors in this module.
_global_lock: Optional[LockType] = None
|
2025-02-26 18:11:16 +08:00
|
|
|
|
|
|
|
|
2025-02-27 08:48:33 +08:00
|
|
|
def initialize_share_data(workers: int = 1):
    """Initialize the module-level shared storage state.

    Creates the manager, the global lock and the three shared containers
    (``_shared_dicts``, ``_share_objects``, ``_init_flags``). Once the state
    has been set up, later calls leave it untouched.

    Args:
        workers: Number of worker processes. Exactly ``1`` selects
            single-process mode (thread lock, plain dicts); any other value
            selects multi-process mode (manager lock, manager dicts).
    """
    global _manager, _is_multiprocess, is_multiprocess, _global_lock
    global _shared_dicts, _share_objects, _init_flags, _initialized

    # Already set up: only refresh the public flag. Re-running the setup
    # would start another Manager and discard the existing shared data.
    if _initialized is not None and _initialized.value:
        is_multiprocess = _is_multiprocess.value
        logger.info(f"Process {os.getpid()} storage data already initialized!")
        return

    _manager = Manager()
    _initialized = _manager.Value("b", False)
    _is_multiprocess = _manager.Value("b", False)

    if workers == 1:
        _is_multiprocess.value = False
        _global_lock = ThreadLock()
        _shared_dicts = {}
        _share_objects = {}
        _init_flags = {}
        logger.info(f"Process {os.getpid()} storage data created for Single Process")
    else:
        _is_multiprocess.value = True
        _global_lock = _manager.Lock()
        # Create shared dictionaries with manager
        _shared_dicts = _manager.dict()
        _share_objects = _manager.dict()
        _init_flags = _manager.dict()  # shared dict holding the init flags
        logger.info(f"Process {os.getpid()} storage data created for Multiple Process")

    is_multiprocess = _is_multiprocess.value

    # Record completion so the guard at the top can short-circuit later calls.
    _initialized.value = True
|
2025-02-26 18:11:16 +08:00
|
|
|
|
2025-02-26 18:11:02 +08:00
|
|
|
def try_initialize_namespace(namespace: str) -> bool:
    """Try to claim the right to initialize ``namespace``.

    The check-and-set on the init-flag dictionary runs while holding the
    global lock, so only the first caller for a given namespace gets True.

    Args:
        namespace: Name of the namespace to claim.

    Returns:
        True if this call recorded the namespace, meaning the caller should
        perform the initialization; False if it was already recorded.

    Raises:
        RuntimeError: If initialize_share_data() has not been called yet, or
            if the init-flag dictionary is missing in multi-process mode.
    """
    global _init_flags

    # Before initialize_share_data() runs these are still None. Fail with the
    # intended error instead of an AttributeError on `.value` or a failure
    # when entering `with None`.
    if _is_multiprocess is None or _global_lock is None:
        raise RuntimeError(
            "Shared storage not initialized. Call initialize_share_data() first."
        )

    if _init_flags is None:
        if _is_multiprocess.value:
            raise RuntimeError(
                "Shared storage not initialized. Call initialize_share_data() first."
            )
        # Single-process mode: a plain dict is enough.
        _init_flags = {}

    logger.info(f"Process {os.getpid()} trying to initialize namespace {namespace}")

    with _global_lock:
        if namespace not in _init_flags:
            _init_flags[namespace] = True
            logger.info(
                f"Process {os.getpid()} ready to initialize namespace {namespace}"
            )
            return True
        logger.info(
            f"Process {os.getpid()} found namespace {namespace} already initialized"
        )
        return False
|
2025-02-26 05:38:38 +08:00
|
|
|
|
2025-02-26 18:11:16 +08:00
|
|
|
|
2025-02-26 05:38:38 +08:00
|
|
|
def _get_global_lock() -> LockType:
    """Return the lock currently stored in the module-level ``_global_lock``.

    The value is ``None`` until initialize_share_data() assigns a lock.
    """
    current_lock = _global_lock
    return current_lock
|
|
|
|
|
2025-02-26 18:11:16 +08:00
|
|
|
|
2025-02-26 05:38:38 +08:00
|
|
|
def get_storage_lock() -> LockType:
    """Return the lock guarding storage data (this is the global lock)."""
    storage_lock = _get_global_lock()
    return storage_lock
|
|
|
|
|
2025-02-26 18:11:16 +08:00
|
|
|
|
2025-02-26 05:38:38 +08:00
|
|
|
def get_scan_lock() -> LockType:
    """Return the lock guarding scan_progress data.

    This is the same lock object that get_storage_lock() returns.
    """
    scan_lock = get_storage_lock()
    return scan_lock
|
|
|
|
|
2025-02-26 18:11:16 +08:00
|
|
|
|
2025-02-26 05:38:38 +08:00
|
|
|
def get_namespace_object(namespace: str) -> Any:
    """Return the object slot registered for ``namespace``.

    On first use the slot is created while holding the global lock: a
    ``_manager.Value("O", None)`` in multi-process mode, plain ``None``
    otherwise. Membership is checked again under the lock before writing.
    """
    if namespace in _share_objects:
        return _share_objects[namespace]

    with _get_global_lock():
        # Check again: the slot may have been created while waiting.
        if namespace not in _share_objects:
            _share_objects[namespace] = (
                _manager.Value("O", None) if _is_multiprocess.value else None
            )

    return _share_objects[namespace]
|
|
|
|
|
2025-02-26 05:38:38 +08:00
|
|
|
def get_namespace_data(namespace: str) -> Dict[str, Any]:
    """Return the storage dictionary for ``namespace``, creating it on first use.

    In multi-process mode the per-namespace dictionary is created with
    ``_manager.dict()``. A plain ``{}`` stored inside a manager dict comes
    back as a copy on every lookup, so callers' writes would not be shared.
    In single-process mode a plain dict is used.

    Args:
        namespace: Storage type / namespace name.

    Returns:
        The dict-like container holding the data of ``namespace``.
    """
    if namespace not in _shared_dicts:
        with _get_global_lock():
            # Check again under the lock before creating the container.
            if namespace not in _shared_dicts:
                if _is_multiprocess.value:
                    _shared_dicts[namespace] = _manager.dict()
                else:
                    _shared_dicts[namespace] = {}

    return _shared_dicts[namespace]
|
2025-02-26 05:38:38 +08:00
|
|
|
|
2025-02-26 18:11:16 +08:00
|
|
|
|
2025-02-26 05:38:38 +08:00
|
|
|
def get_scan_progress() -> Dict[str, Any]:
    """Return the namespace data stored under the "scan_progress" key."""
    namespace = "scan_progress"
    return get_namespace_data(namespace)
|