Add Unicode collation so document scanning sorts Chinese file names correctly

2025-06-26 22:00:19 +00:00 · 2025-04-25 01:02:09 +08:00 · 2025-04-25 01:02:09 +08:00 · 31bd274601
commit 31bd274601
parent df1d6b31ed
3 changed files with 10 additions and 1 deletion
--- a/lightrag/api/requirements.txt
+++ b/lightrag/api/requirements.txt
@ -16,6 +16,7 @@ python-dotenv
 python-jose[cryptography]
 python-multipart
 pytz
+pyuca
 tenacity
 tiktoken
 uvicorn
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@ -3,6 +3,7 @@ This module contains all document-related routes for the LightRAG API.
 """

 import asyncio
+from pyuca import Collator
 from lightrag.utils import logger
 import aiofiles
 import shutil
@ -614,8 +615,12 @@ async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
    try:
        enqueued = False

+        # Create Collator for Unicode sorting
+        collator = Collator()
+        sorted_file_paths = sorted(file_paths, key=lambda p: collator.sort_key(str(p)))
+
        # Process files sequentially
-        for file_path in file_paths:
+        for file_path in sorted_file_paths:
            if await pipeline_enqueue_file(rag, file_path):
                enqueued = True

--- a/requirements.txt
+++ b/requirements.txt
@ -11,6 +11,9 @@ pipmaster
 pydantic
 python-dotenv

+# Unicode Collation Algorithm for proper Chinese sorting
+pyuca
+
 setuptools
 tenacity