Add Unicode collation so that Chinese file names sort correctly during document scanning

This commit is contained in:
yangdx 2025-04-25 01:02:09 +08:00
parent df1d6b31ed
commit 31bd274601
3 changed files with 10 additions and 1 deletion

View File

@ -16,6 +16,7 @@ python-dotenv
python-jose[cryptography] python-jose[cryptography]
python-multipart python-multipart
pytz pytz
pyuca
tenacity tenacity
tiktoken tiktoken
uvicorn uvicorn

View File

@ -3,6 +3,7 @@ This module contains all document-related routes for the LightRAG API.
""" """
import asyncio import asyncio
from pyuca import Collator
from lightrag.utils import logger from lightrag.utils import logger
import aiofiles import aiofiles
import shutil import shutil
@ -614,8 +615,12 @@ async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
try: try:
enqueued = False enqueued = False
# Create Collator for Unicode sorting
collator = Collator()
sorted_file_paths = sorted(file_paths, key=lambda p: collator.sort_key(str(p)))
# Process files sequentially # Process files sequentially
for file_path in file_paths: for file_path in sorted_file_paths:
if await pipeline_enqueue_file(rag, file_path): if await pipeline_enqueue_file(rag, file_path):
enqueued = True enqueued = True

View File

@ -11,6 +11,9 @@ pipmaster
pydantic pydantic
python-dotenv python-dotenv
# Unicode Collation Algorithm for proper Chinese sorting
pyuca
setuptools setuptools
tenacity tenacity