| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  | """Document loader helpers.""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import concurrent.futures | 
					
						
							| 
									
										
										
										
											2024-09-12 15:50:49 +08:00
										 |  |  | from pathlib import Path | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  | from typing import NamedTuple, Optional, cast | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class FileEncoding(NamedTuple): | 
					
						
							|  |  |  |     """A file encoding as the NamedTuple.""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     encoding: Optional[str] | 
					
						
							|  |  |  |     """The encoding of the file.""" | 
					
						
							|  |  |  |     confidence: float | 
					
						
							|  |  |  |     """The confidence of the encoding.""" | 
					
						
							|  |  |  |     language: Optional[str] | 
					
						
							|  |  |  |     """The language of the file.""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def detect_file_encodings(file_path: str, timeout: int = 5) -> list[FileEncoding]: | 
					
						
							|  |  |  |     """Try to detect the file encoding.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Returns a list of `FileEncoding` tuples with the detected encodings ordered | 
					
						
							|  |  |  |     by confidence. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Args: | 
					
						
							|  |  |  |         file_path: The path to the file to detect the encoding for. | 
					
						
							|  |  |  |         timeout: The timeout in seconds for the encoding detection. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     import chardet | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def read_and_detect(file_path: str) -> list[dict]: | 
					
						
							| 
									
										
										
										
											2024-09-12 15:50:49 +08:00
										 |  |  |         rawdata = Path(file_path).read_bytes() | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  |         return cast(list[dict], chardet.detect_all(rawdata)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with concurrent.futures.ThreadPoolExecutor() as executor: | 
					
						
							|  |  |  |         future = executor.submit(read_and_detect, file_path) | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             encodings = future.result(timeout=timeout) | 
					
						
							|  |  |  |         except concurrent.futures.TimeoutError: | 
					
						
							| 
									
										
										
										
											2024-09-10 17:00:20 +08:00
										 |  |  |             raise TimeoutError(f"Timeout reached while detecting encoding for {file_path}") | 
					
						
							| 
									
										
										
										
											2024-02-22 23:31:57 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if all(encoding["encoding"] is None for encoding in encodings): | 
					
						
							|  |  |  |         raise RuntimeError(f"Could not detect encoding for {file_path}") | 
					
						
							|  |  |  |     return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None] |