Refactor: move build_file_path function from operate.py to utils.py

This commit is contained in:
yangdx 2025-07-26 10:52:59 +08:00
parent c8c3545454
commit 7b915b34f6
2 changed files with 54 additions and 42 deletions

View File

@ -28,6 +28,7 @@ from .utils import (
remove_think_tags,
linear_gradient_weighted_polling,
process_chunks_unified,
build_file_path,
)
from .base import (
BaseGraphStorage,
@ -43,7 +44,6 @@ from .constants import (
DEFAULT_MAX_RELATION_TOKENS,
DEFAULT_MAX_TOTAL_TOKENS,
DEFAULT_RELATED_CHUNK_NUMBER,
DEFAULT_MAX_FILE_PATH_LENGTH,
)
from .kg.shared_storage import get_storage_keyed_lock
import time
@ -3133,47 +3133,6 @@ async def kg_query_with_keywords(
return response
def build_file_path(already_file_paths, data_list, target):
    """Merge existing file paths with new ones into one delimited string.

    Args:
        already_file_paths: Existing file paths. Falsy entries are dropped;
            the remaining ones are joined in their given order. They are
            not deduplicated among themselves and are not subject to the
            length limit.
        data_list: Items exposing ``.get("file_path")``. Each new, non-empty
            path not seen before is appended while the result stays below
            DEFAULT_MAX_FILE_PATH_LENGTH.
        target: Label interpolated into the warning that is logged when
            one or more paths had to be left out.

    Returns:
        str: The accepted file paths joined by GRAPH_FIELD_SEP.
    """
    # Membership set so that each new path is considered at most once.
    seen = {path for path in already_file_paths if path}
    # Seed the result with the truthy existing paths, order preserved.
    merged = GRAPH_FIELD_SEP.join(filter(None, already_file_paths))
    # Separator-prefixed concatenation of every path that did not fit.
    skipped = ""

    for item in data_list:
        candidate = item.get("file_path")
        if not candidate or candidate in seen:
            continue
        seen.add(candidate)

        suffix = GRAPH_FIELD_SEP + candidate
        # The separator is counted even when the result is still empty.
        if len(merged) + len(suffix) >= DEFAULT_MAX_FILE_PATH_LENGTH:
            skipped += suffix
            continue
        merged += suffix if merged else candidate

    if skipped:
        logger.warning(
            f"Length of file_path exceeds {target}, ignoring new file: {skipped}"
        )
    return merged
# TODO: Deprecated, use user_prompt in QueryParam instead
async def query_with_keywords(
query: str,

View File

@ -19,6 +19,8 @@ from lightrag.constants import (
DEFAULT_LOG_MAX_BYTES,
DEFAULT_LOG_BACKUP_COUNT,
DEFAULT_LOG_FILENAME,
GRAPH_FIELD_SEP,
DEFAULT_MAX_FILE_PATH_LENGTH,
)
@ -1901,3 +1903,54 @@ async def process_chunks_unified(
)
return unique_chunks
def build_file_path(already_file_paths, data_list, target):
    """Combine known and newly discovered file paths under a length cap.

    Existing paths are kept in order (falsy values removed) and form the
    start of the result; they are neither deduplicated among themselves nor
    checked against the cap. Paths taken from ``data_list`` are appended
    only if they are non-empty, not already present, and the resulting
    string would stay strictly shorter than DEFAULT_MAX_FILE_PATH_LENGTH.

    Args:
        already_file_paths: List of existing file paths
        data_list: List of data items; each is queried via
            ``.get("file_path")``
        target: Target name included in the warning that reports
            paths which were left out

    Returns:
        str: Combined file paths separated by GRAPH_FIELD_SEP
    """
    known_paths = {existing for existing in already_file_paths if existing}
    combined = GRAPH_FIELD_SEP.join(filter(None, already_file_paths))
    overflow = ""

    for entry in data_list:
        new_path = entry.get("file_path")
        # Nothing to do for missing/empty paths or ones already recorded.
        if not new_path or new_path in known_paths:
            continue
        # Recorded even if it ends up not fitting, so it is reported once.
        known_paths.add(new_path)

        piece = GRAPH_FIELD_SEP + new_path
        fits = len(combined) + len(piece) < DEFAULT_MAX_FILE_PATH_LENGTH
        if fits:
            # The first accepted path is stored without a leading separator.
            combined += piece if combined else new_path
        else:
            overflow += piece

    if overflow:
        logger.warning(
            f"Length of file_path exceeds {target}, ignoring new file: {overflow}"
        )
    return combined