mirror of
				https://github.com/langgenius/dify.git
				synced 2025-10-31 02:42:59 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			45 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			45 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """Document loader helpers."""
 | |
| 
 | |
| import concurrent.futures
 | |
| from pathlib import Path
 | |
| from typing import NamedTuple, Optional, cast
 | |
| 
 | |
| 
 | |
class FileEncoding(NamedTuple):
    """One encoding candidate reported for a file.

    Attributes:
        encoding: The encoding of the file, or ``None`` when no encoding
            name is available for this candidate.
        confidence: The confidence of the encoding.
        language: The language of the file, or ``None`` when not available.
    """

    encoding: Optional[str]
    confidence: float
    language: Optional[str]
 | |
| 
 | |
| 
 | |
def detect_file_encodings(file_path: str, timeout: int = 5) -> list[FileEncoding]:
    """Try to detect the file encoding.

    The file is read and passed to ``chardet.detect_all`` in a worker thread so
    that the caller can stop waiting after ``timeout`` seconds.

    Args:
        file_path: The path to the file to detect the encoding for.
        timeout: The timeout in seconds for the encoding detection.

    Returns:
        A list of `FileEncoding` tuples, one per candidate whose encoding is
        not ``None``, in the order returned by ``chardet.detect_all``
        (ordered by confidence).

    Raises:
        TimeoutError: If detection does not finish within ``timeout`` seconds.
        RuntimeError: If no candidate with a non-``None`` encoding is found.
    """
    import chardet

    def read_and_detect(file_path: str) -> list[dict]:
        rawdata = Path(file_path).read_bytes()
        return cast(list[dict], chardet.detect_all(rawdata))

    # The executor is deliberately NOT used as a context manager: leaving a
    # `with` block calls shutdown(wait=True), which would block until the
    # worker finishes and make the timeout below ineffective.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(read_and_detect, file_path)
        try:
            encodings = future.result(timeout=timeout)
        except concurrent.futures.TimeoutError as exc:
            raise TimeoutError(f"Timeout reached while detecting encoding for {file_path}") from exc
    finally:
        # Do not wait for the worker. A running thread cannot be interrupted,
        # so after a timeout it finishes in the background and its result is
        # discarded.
        executor.shutdown(wait=False)

    if all(encoding["encoding"] is None for encoding in encodings):
        raise RuntimeError(f"Could not detect encoding for {file_path}")
    return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
 | 
