mirror of
https://github.com/HKUDS/LightRAG.git
synced 2025-06-26 22:00:19 +00:00
Add Unicode collation so that Chinese file names sort correctly during document scanning
This commit is contained in:
parent
df1d6b31ed
commit
31bd274601
@@ -16,6 +16,7 @@ python-dotenv
|
||||
python-jose[cryptography]
|
||||
python-multipart
|
||||
pytz
|
||||
pyuca
|
||||
tenacity
|
||||
tiktoken
|
||||
uvicorn
|
||||
|
@@ -3,6 +3,7 @@ This module contains all document-related routes for the LightRAG API.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from pyuca import Collator
|
||||
from lightrag.utils import logger
|
||||
import aiofiles
|
||||
import shutil
|
||||
@@ -614,8 +615,12 @@ async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
|
||||
try:
|
||||
enqueued = False
|
||||
|
||||
# Create Collator for Unicode sorting
|
||||
collator = Collator()
|
||||
sorted_file_paths = sorted(file_paths, key=lambda p: collator.sort_key(str(p)))
|
||||
|
||||
# Process files sequentially
|
||||
for file_path in file_paths:
|
||||
for file_path in sorted_file_paths:
|
||||
if await pipeline_enqueue_file(rag, file_path):
|
||||
enqueued = True
|
||||
|
||||
|
@@ -11,6 +11,9 @@ pipmaster
|
||||
pydantic
|
||||
python-dotenv
|
||||
|
||||
# Unicode Collation Algorithm for proper Chinese sorting
|
||||
pyuca
|
||||
|
||||
setuptools
|
||||
tenacity
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user