| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  | import json | 
					
						
							|  |  |  | import logging | 
					
						
							|  |  |  | from typing import List | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from langchain.document_loaders.base import BaseLoader | 
					
						
							|  |  |  | from langchain.schema import Document | 
					
						
							|  |  |  | from openpyxl.reader.excel import load_workbook | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | logger = logging.getLogger(__name__) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class ExcelLoader(BaseLoader): | 
					
						
							|  |  |  |     """Load xlxs files.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Args: | 
					
						
							|  |  |  |         file_path: Path to the file to load. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __init__( | 
					
						
							|  |  |  |         self, | 
					
						
							|  |  |  |         file_path: str | 
					
						
							|  |  |  |     ): | 
					
						
							|  |  |  |         """Initialize with file path.""" | 
					
						
							|  |  |  |         self._file_path = file_path | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def load(self) -> List[Document]: | 
					
						
							|  |  |  |         data = [] | 
					
						
							|  |  |  |         keys = [] | 
					
						
							|  |  |  |         wb = load_workbook(filename=self._file_path, read_only=True) | 
					
						
							|  |  |  |         # loop over all sheets | 
					
						
							|  |  |  |         for sheet in wb: | 
					
						
							| 
									
										
										
										
											2023-08-29 10:36:48 +08:00
										 |  |  |             if 'A1:A1' == sheet.calculate_dimension(): | 
					
						
							|  |  |  |                 sheet.reset_dimensions() | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |             for row in sheet.iter_rows(values_only=True): | 
					
						
							|  |  |  |                 if all(v is None for v in row): | 
					
						
							|  |  |  |                     continue | 
					
						
							|  |  |  |                 if keys == []: | 
					
						
							|  |  |  |                     keys = list(map(str, row)) | 
					
						
							|  |  |  |                 else: | 
					
						
							| 
									
										
										
										
											2023-06-27 17:15:03 +08:00
										 |  |  |                     row_dict = dict(zip(keys, list(map(str, row)))) | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  |                     row_dict = {k: v for k, v in row_dict.items() if v} | 
					
						
							| 
									
										
										
										
											2023-08-24 21:27:31 +08:00
										 |  |  |                     item = ''.join(f'{k}:{v};' for k, v in row_dict.items()) | 
					
						
							| 
									
										
										
										
											2023-07-28 20:47:15 +08:00
										 |  |  |                     document = Document(page_content=item, metadata={'source': self._file_path}) | 
					
						
							| 
									
										
										
										
											2023-06-28 13:58:50 +08:00
										 |  |  |                     data.append(document) | 
					
						
							| 
									
										
										
										
											2023-06-25 16:49:14 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-28 13:58:50 +08:00
										 |  |  |         return data |