Add Unicode collation so files with Chinese names are sorted correctly during document scanning

This commit is contained in:
yangdx 2025-04-25 01:02:09 +08:00
parent df1d6b31ed
commit 31bd274601
3 changed files with 10 additions and 1 deletions

View File

@ -16,6 +16,7 @@ python-dotenv
python-jose[cryptography]
python-multipart
pytz
pyuca
tenacity
tiktoken
uvicorn

View File

@ -3,6 +3,7 @@ This module contains all document-related routes for the LightRAG API.
"""
import asyncio
from pyuca import Collator
from lightrag.utils import logger
import aiofiles
import shutil
@ -614,8 +615,12 @@ async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
try:
enqueued = False
# Create Collator for Unicode sorting
collator = Collator()
sorted_file_paths = sorted(file_paths, key=lambda p: collator.sort_key(str(p)))
# Process files sequentially
for file_path in file_paths:
for file_path in sorted_file_paths:
if await pipeline_enqueue_file(rag, file_path):
enqueued = True

View File

@ -11,6 +11,9 @@ pipmaster
pydantic
python-dotenv
# Unicode Collation Algorithm for proper Chinese sorting
pyuca
setuptools
tenacity