2025-02-26 18:11:02 +08:00
|
|
|
import os
|
2025-02-27 13:25:22 +08:00
|
|
|
import sys
|
2025-02-26 05:38:38 +08:00
|
|
|
from multiprocessing.synchronize import Lock as ProcessLock
|
|
|
|
from threading import Lock as ThreadLock
|
|
|
|
from multiprocessing import Manager
|
|
|
|
from typing import Any, Dict, Optional, Union
|
2025-02-27 19:05:51 +08:00
|
|
|
|
2025-02-26 05:38:38 +08:00
|
|
|
|
2025-02-27 13:25:22 +08:00
|
|
|
# Define a direct print function for critical logs that must be visible in all processes
|
|
|
|
def direct_log(message, level="INFO"):
    """
    Write a log line directly to stderr.

    The line has the form "<level>: <message>" and is flushed immediately so
    that it is visible in all processes, including the Gunicorn master process.

    Args:
        message: Content of the log line.
        level: Severity label printed in front of the message ("INFO" by default).
    """
    line = f"{level}: {message}"
    print(line, file=sys.stderr, flush=True)
|
|
|
|
|
2025-02-27 19:05:51 +08:00
|
|
|
|
2025-02-26 05:38:38 +08:00
|
|
|
# Type alias covering both lock classes imported above (process-level and
# thread-level); used to annotate the module-wide storage lock.
LockType = Union[ProcessLock, ThreadLock]

# Module-level state. Every name starts out as None and is assigned by
# initialize_share_data(); finalize_share_data() resets them to None again.
_manager = None  # multiprocessing Manager instance, when one has been created
_initialized = None  # becomes True once initialize_share_data() has run
is_multiprocess = None  # True when workers > 1, False otherwise
_global_lock: Optional[LockType] = None  # lock returned by get_storage_lock()

# shared data for storage across processes
_shared_dicts: Optional[Dict[str, Any]] = None  # namespace -> storage dict
_init_flags: Optional[Dict[str, bool]] = None  # namespace -> initialized
|
|
|
|
|
2025-02-28 01:25:59 +08:00
|
|
|
|
2025-02-27 08:48:33 +08:00
|
|
|
def initialize_share_data(workers: int = 1):
    """
    Initialize shared storage data for single or multi-process mode.

    When used with Gunicorn's preload feature, this function is called once in the
    master process before forking worker processes, allowing all workers to share
    the same initialized data.

    In single-process mode, this function is called during LightRAG object initialization.

    The function determines whether to use cross-process shared variables for data storage
    based on the number of workers. If workers <= 1, it uses a thread lock and local
    dictionaries and no Manager is started. If workers > 1, it starts a
    multiprocessing.Manager and uses a lock and dictionaries managed by it.

    If shared data is already initialized, the call only logs a message and returns.

    After the containers are created, the "pipeline_status" namespace is populated
    with its default fields (see the dictionary literal at the end of the function).

    Args:
        workers (int): Number of worker processes. If 1, single-process mode is used.
            If > 1, multi-process mode with shared memory is used.
    """
    global _manager, is_multiprocess, _global_lock
    global _shared_dicts, _init_flags, _initialized

    # Check if already initialized
    if _initialized:
        direct_log(
            f"Process {os.getpid()} Shared-Data already initialized (multiprocess={is_multiprocess})"
        )
        return

    if workers > 1:
        is_multiprocess = True
        # Only multi-process mode needs a Manager. Creating one starts a
        # separate server process, so it is not done for a single worker.
        _manager = Manager()
        _global_lock = _manager.Lock()
        _shared_dicts = _manager.dict()
        _init_flags = _manager.dict()
        direct_log(
            f"Process {os.getpid()} Shared-Data created for Multiple Process (workers={workers})"
        )
    else:
        is_multiprocess = False
        _manager = None
        _global_lock = ThreadLock()
        _shared_dicts = {}
        _init_flags = {}
        direct_log(f"Process {os.getpid()} Shared-Data created for Single Process")

    # Mark as initialized
    _initialized = True

    # Initialize pipeline status for document indexing control
    pipeline_namespace = get_namespace_data("pipeline_status")

    # Create a shared list object for history_messages
    history_messages = _manager.list() if is_multiprocess else []

    pipeline_namespace.update(
        {
            "busy": False,  # Control concurrent processes
            "job_name": "Default Job",  # Current job name (indexing files/indexing texts)
            "job_start": None,  # Job start time
            "docs": 0,  # Total number of documents to be indexed
            "batchs": 0,  # Number of batches for processing documents
            "cur_batch": 0,  # Current processing batch
            "request_pending": False,  # Flag for pending request for processing
            "latest_message": "",  # Latest message from pipeline processing
            "history_messages": history_messages,  # Use the shared list object
        }
    )
|
2025-02-26 18:11:16 +08:00
|
|
|
|
2025-02-27 19:05:51 +08:00
|
|
|
|
2025-02-26 18:11:02 +08:00
|
|
|
def try_initialize_namespace(namespace: str) -> bool:
    """
    Try to initialize a namespace. Returns True if the current process gets initialization permission.

    The check-and-set on the init-flag dictionary is done while holding the
    storage lock, so that of several callers racing on the same namespace only
    one sees the flag as missing and is granted permission.

    NOTE: the storage lock is not re-entrant; do not call this function while
    already holding the lock returned by get_storage_lock().

    Args:
        namespace: Name of the storage namespace to claim.

    Returns:
        True if this call registered the namespace (the caller should go on to
        initialize it), False if the namespace was already registered.

    Raises:
        ValueError: If shared data has not been initialized yet.
    """
    if _init_flags is None:
        raise ValueError("Try to create nanmespace before Shared-Data is initialized")

    # "not in" followed by an assignment is two separate operations (two
    # round trips when _init_flags is a Manager dict proxy). Without the lock
    # two processes could both find the flag missing and both return True.
    with get_storage_lock():
        claimed = namespace not in _init_flags
        if claimed:
            _init_flags[namespace] = True

    # Log outside the critical section to keep the lock hold time short.
    if claimed:
        direct_log(
            f"Process {os.getpid()} ready to initialize storage namespace: [{namespace}]"
        )
        return True

    direct_log(
        f"Process {os.getpid()} storage namespace already to initialized: [{namespace}]"
    )
    return False
|
2025-02-26 05:38:38 +08:00
|
|
|
|
2025-02-26 18:11:16 +08:00
|
|
|
|
2025-02-26 05:38:38 +08:00
|
|
|
def get_storage_lock() -> LockType:
    """
    Return the module-wide storage lock used for data consistency.

    The returned value is whatever `_global_lock` currently holds, which is
    None until initialize_share_data() has assigned a lock.
    """
    storage_lock = _global_lock
    return storage_lock
|
2025-02-26 18:11:02 +08:00
|
|
|
|
2025-02-27 19:05:51 +08:00
|
|
|
|
2025-02-26 05:38:38 +08:00
|
|
|
def get_namespace_data(namespace: str) -> Dict[str, Any]:
    """
    Return the storage dictionary for a specific storage type (namespace).

    The dictionary is created on first access while holding the storage lock:
    a Manager-backed dict when running multi-process with a Manager available,
    a plain dict otherwise.

    Args:
        namespace: Name of the storage namespace.

    Returns:
        The dictionary registered under `namespace`.

    Raises:
        ValueError: If shared data has not been initialized yet.
    """
    # Guard clause: the top-level container must exist before any lookup.
    if _shared_dicts is None:
        direct_log(
            f"Error: try to getnanmespace before Shared-Data is initialized, pid={os.getpid()}",
            level="ERROR",
        )
        raise ValueError("Shared dictionaries not initialized")

    with get_storage_lock():
        if namespace not in _shared_dicts:
            use_manager = is_multiprocess and _manager is not None
            _shared_dicts[namespace] = _manager.dict() if use_manager else {}

    return _shared_dicts[namespace]
|
2025-02-26 05:38:38 +08:00
|
|
|
|
2025-02-26 18:11:16 +08:00
|
|
|
|
2025-02-27 13:25:22 +08:00
|
|
|
def finalize_share_data():
    """
    Release shared resources and clean up.

    This function should be called when the application is shutting down
    to properly release shared resources and avoid memory leaks.

    If a Manager exists, the shared dictionaries are cleared and the Manager is
    shut down; errors raised while doing so are logged and not propagated.
    Afterwards all module-level state is reset to None, so a later call to
    initialize_share_data() starts from scratch.

    If shared data was never initialized, the call only logs a message.
    """
    global _manager, is_multiprocess, _global_lock
    global _shared_dicts, _init_flags, _initialized

    # Nothing to release if shared data was never initialized
    if not _initialized:
        direct_log(
            f"Process {os.getpid()} storage data not initialized, nothing to finalize"
        )
        return

    direct_log(
        f"Process {os.getpid()} finalizing storage data (multiprocess={is_multiprocess})"
    )

    # Shut down the Manager whenever one exists, not only when is_multiprocess
    # is set, so that its server process is always released explicitly.
    if _manager is not None:
        try:
            # Clear shared dictionaries first
            if _shared_dicts is not None:
                _shared_dicts.clear()
            if _init_flags is not None:
                _init_flags.clear()

            # Shut down the Manager
            _manager.shutdown()
            direct_log(f"Process {os.getpid()} Manager shutdown complete")
        except Exception as e:
            # Best-effort cleanup: report the failure but still reset state below
            direct_log(
                f"Process {os.getpid()} Error shutting down Manager: {e}", level="ERROR"
            )

    # Reset global variables
    _manager = None
    _initialized = None
    is_multiprocess = None
    _shared_dicts = None
    _init_flags = None
    _global_lock = None

    direct_log(f"Process {os.getpid()} storage data finalization complete")
|