LightRAG/lightrag/kg/shared_storage.py

129 lines
4.1 KiB
Python
Raw Normal View History

import os
from multiprocessing.synchronize import Lock as ProcessLock
from threading import Lock as ThreadLock
from multiprocessing import Manager
from typing import Any, Dict, Optional, Union
from lightrag.utils import logger
# A lock is either a multiprocessing lock or a threading lock, depending on
# the mode chosen in initialize_share_data().
LockType = Union[ProcessLock, ThreadLock]
# multiprocessing Manager created by initialize_share_data(); None until then.
_manager = None
# Manager-backed flag objects created by initialize_share_data(); None until then.
_initialized = None
_is_multiprocess = None
# Public copy of _is_multiprocess.value, refreshed by initialize_share_data().
is_multiprocess = None
# shared data for storage across processes
_shared_dicts: Optional[Dict[str, Any]] = None
_share_objects: Optional[Dict[str, Any]] = None
_init_flags: Optional[Dict[str, bool]] = None # namespace -> initialized
_global_lock: Optional[LockType] = None
2025-02-26 18:11:16 +08:00
def initialize_share_data(workers: int = 1):
    """Create the module-level shared storage state.

    With ``workers == 1`` the namespace containers are plain dicts guarded by
    a threading lock. For any other value they are manager-backed dicts
    guarded by a manager lock so that several processes can use them.

    Args:
        workers: Number of worker processes the application runs with.
    """
    global _manager, _is_multiprocess, is_multiprocess, _global_lock
    global _shared_dicts, _share_objects, _init_flags, _initialized

    if _initialized and _initialized.value:
        is_multiprocess = _is_multiprocess.value
        # Only multi-process mode keeps the existing state; a single-process
        # caller falls through and rebuilds its local containers, as before.
        if _is_multiprocess.value:
            logger.info(f"Process {os.getpid()} storage data already initialized!")
            return

    _manager = Manager()
    _initialized = _manager.Value("b", False)
    _is_multiprocess = _manager.Value("b", False)

    if workers == 1:
        _is_multiprocess.value = False
        _global_lock = ThreadLock()
        _shared_dicts = {}
        _share_objects = {}
        _init_flags = {}
        logger.info(f"Process {os.getpid()} storage data created for Single Process")
    else:
        _is_multiprocess.value = True
        _global_lock = _manager.Lock()
        # Create shared dictionaries with manager
        _shared_dicts = _manager.dict()
        _share_objects = _manager.dict()
        # Initialization flags live in a shared dict as well.
        _init_flags = _manager.dict()
        logger.info(f"Process {os.getpid()} storage data created for Multiple Process")

    is_multiprocess = _is_multiprocess.value
    # Record that setup finished. Without this the guard at the top never
    # fires and every call replaces the manager and all shared containers.
    _initialized.value = True
2025-02-26 18:11:16 +08:00
def try_initialize_namespace(namespace: str) -> bool:
    """Try to claim the right to initialize a namespace.

    The check-and-set on the initialization flags runs under the global lock,
    so only one caller can claim a given namespace.

    Args:
        namespace: Name of the namespace to claim.

    Returns:
        True if the calling process claimed the namespace and should
        initialize it; False if the namespace was already claimed.

    Raises:
        RuntimeError: If initialize_share_data() has not been called yet.
    """
    global _init_flags

    # Before initialize_share_data() runs, both the mode flag and the lock
    # are None; report that clearly instead of failing on a None attribute.
    if _is_multiprocess is None or _global_lock is None:
        raise RuntimeError(
            "Shared storage not initialized. Call initialize_share_data() first."
        )

    if _init_flags is None:
        if _is_multiprocess.value:
            raise RuntimeError(
                "Shared storage not initialized. Call initialize_share_data() first."
            )
        # Single-process mode can fall back to a local flag dict.
        _init_flags = {}

    logger.info(f"Process {os.getpid()} trying to initialize namespace {namespace}")

    with _global_lock:
        if namespace not in _init_flags:
            _init_flags[namespace] = True
            logger.info(
                f"Process {os.getpid()} ready to initialize namespace {namespace}"
            )
            return True

        logger.info(
            f"Process {os.getpid()} found namespace {namespace} already initialized"
        )
        return False
2025-02-26 18:11:16 +08:00
def _get_global_lock() -> LockType:
    """Return the current value of the module-level ``_global_lock``."""
    current_lock = _global_lock
    return current_lock
2025-02-26 18:11:16 +08:00
def get_storage_lock() -> LockType:
    """Return the lock used to keep storage data consistent."""
    storage_lock = _get_global_lock()
    return storage_lock
2025-02-26 18:11:16 +08:00
def get_scan_lock() -> LockType:
    """Return the lock used to keep scan_progress data consistent.

    This is the same lock object that get_storage_lock() returns.
    """
    scan_lock = get_storage_lock()
    return scan_lock
2025-02-26 18:11:16 +08:00
def get_namespace_object(namespace: str) -> Any:
    """Return the object slot registered for ``namespace``.

    A missing slot is created under the global lock: a manager ``Value``
    holding ``None`` in multi-process mode, plain ``None`` otherwise.
    """
    if namespace in _share_objects:
        return _share_objects[namespace]

    with _get_global_lock():
        # Check again now that the lock is held; another caller may have
        # created the slot in the meantime.
        if namespace not in _share_objects:
            placeholder = _manager.Value("O", None) if _is_multiprocess.value else None
            _share_objects[namespace] = placeholder

    return _share_objects[namespace]
def get_namespace_data(namespace: str) -> Dict[str, Any]:
    """Return the storage dict for a namespace, creating it on first use.

    Args:
        namespace: Storage type (namespace) whose data dict is requested.

    Returns:
        The dict registered for ``namespace``. In multi-process mode this is
        a manager dict proxy, so writes made through it reach the manager
        and are visible to other processes.
    """
    if namespace not in _shared_dicts:
        with _get_global_lock():
            # Check again under the lock so only one caller creates the entry.
            if namespace not in _shared_dicts:
                if _is_multiprocess.value:
                    # A plain dict stored inside a manager dict comes back as
                    # a detached copy on every read, so caller mutations
                    # would be lost. A nested manager dict stays shared.
                    _shared_dicts[namespace] = _manager.dict()
                else:
                    _shared_dicts[namespace] = {}

    return _shared_dicts[namespace]
2025-02-26 18:11:16 +08:00
def get_scan_progress() -> Dict[str, Any]:
    """Return the namespace data dict that holds document scanning progress."""
    progress_namespace = "scan_progress"
    return get_namespace_data(progress_namespace)