mirror of
https://github.com/HKUDS/LightRAG.git
synced 2025-06-26 22:00:19 +00:00
Add Unicode collation so Chinese file names sort correctly during document scanning
This commit is contained in:
parent
df1d6b31ed
commit
31bd274601
@ -16,6 +16,7 @@ python-dotenv
|
|||||||
python-jose[cryptography]
|
python-jose[cryptography]
|
||||||
python-multipart
|
python-multipart
|
||||||
pytz
|
pytz
|
||||||
|
pyuca
|
||||||
tenacity
|
tenacity
|
||||||
tiktoken
|
tiktoken
|
||||||
uvicorn
|
uvicorn
|
||||||
|
@ -3,6 +3,7 @@ This module contains all document-related routes for the LightRAG API.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
from pyuca import Collator
|
||||||
from lightrag.utils import logger
|
from lightrag.utils import logger
|
||||||
import aiofiles
|
import aiofiles
|
||||||
import shutil
|
import shutil
|
||||||
@ -614,8 +615,12 @@ async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
|
|||||||
try:
|
try:
|
||||||
enqueued = False
|
enqueued = False
|
||||||
|
|
||||||
|
# Create Collator for Unicode sorting
|
||||||
|
collator = Collator()
|
||||||
|
sorted_file_paths = sorted(file_paths, key=lambda p: collator.sort_key(str(p)))
|
||||||
|
|
||||||
# Process files sequentially
|
# Process files sequentially
|
||||||
for file_path in file_paths:
|
for file_path in sorted_file_paths:
|
||||||
if await pipeline_enqueue_file(rag, file_path):
|
if await pipeline_enqueue_file(rag, file_path):
|
||||||
enqueued = True
|
enqueued = True
|
||||||
|
|
||||||
|
@ -11,6 +11,9 @@ pipmaster
|
|||||||
pydantic
|
pydantic
|
||||||
python-dotenv
|
python-dotenv
|
||||||
|
|
||||||
|
# Unicode Collation Algorithm for proper Chinese sorting
|
||||||
|
pyuca
|
||||||
|
|
||||||
setuptools
|
setuptools
|
||||||
tenacity
|
tenacity
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user