diff --git a/lightrag/api/requirements.txt b/lightrag/api/requirements.txt index afd76f5b..995e2bc1 100644 --- a/lightrag/api/requirements.txt +++ b/lightrag/api/requirements.txt @@ -16,6 +16,7 @@ python-dotenv python-jose[cryptography] python-multipart pytz +pyuca tenacity tiktoken uvicorn diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index d4421cf6..823d7bff 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -3,6 +3,7 @@ This module contains all document-related routes for the LightRAG API. """ import asyncio +from pyuca import Collator from lightrag.utils import logger import aiofiles import shutil @@ -614,8 +615,12 @@ async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]): try: enqueued = False + # Create Collator for Unicode sorting + collator = Collator() + sorted_file_paths = sorted(file_paths, key=lambda p: collator.sort_key(str(p))) + # Process files sequentially - for file_path in file_paths: + for file_path in sorted_file_paths: if await pipeline_enqueue_file(rag, file_path): enqueued = True diff --git a/requirements.txt b/requirements.txt index e247185c..3161337a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,9 @@ pipmaster pydantic python-dotenv +# Unicode Collation Algorithm for proper Chinese sorting +pyuca + setuptools tenacity