mirror of
				https://github.com/langgenius/dify.git
				synced 2025-10-31 19:03:09 +00:00 
			
		
		
		
	
		
			
	
	
		
			56 lines
		
	
	
		
			1.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			56 lines
		
	
	
		
			1.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
|   | import logging | ||
|  | from typing import List, Optional | ||
|  | 
 | ||
|  | from langchain.document_loaders import PyPDFium2Loader | ||
|  | from langchain.document_loaders.base import BaseLoader | ||
|  | from langchain.schema import Document | ||
|  | 
 | ||
|  | from extensions.ext_storage import storage | ||
|  | from models.model import UploadFile | ||
|  | 
 | ||
|  | logger = logging.getLogger(__name__) | ||
|  | 
 | ||
|  | 
 | ||
class PdfLoader(BaseLoader):
    """Load a PDF file as documents, caching the extracted plain text in storage.

    Args:
        file_path: Path to the file to load.
        upload_file: Optional upload record. When it is set and has a truthy
            ``hash``, the extracted text is cached in storage under a key built
            from the record's ``tenant_id`` and ``hash``.
    """

    def __init__(
        self,
        file_path: str,
        upload_file: Optional[UploadFile] = None
    ):
        """Initialize with the file path and an optional upload record."""
        self._file_path = file_path
        self._upload_file = upload_file

    def _plaintext_cache_key(self) -> str:
        """Return the storage key of the cached plain text, or '' when caching is off."""
        upload_file = self._upload_file
        if not upload_file or not upload_file.hash:
            return ''
        return 'upload_files/' + upload_file.tenant_id + '/' \
               + upload_file.hash + '.0625.plaintext'

    def load(self) -> List[Document]:
        """Load the PDF, preferring the cached plain text when one exists.

        Returns:
            On a cache hit, a single-element list holding one ``Document`` whose
            ``page_content`` is the whole cached text. Otherwise, the list
            returned by ``PyPDFium2Loader.load()`` unchanged.

        NOTE(review): the two paths return differently shaped lists (one merged
        document vs. the loader's own documents) - confirm callers accept both.
        """
        cache_key = self._plaintext_cache_key()
        if cache_key:
            try:
                cached = storage.load(cache_key)
            except FileNotFoundError:
                # Cache miss: fall through and parse the PDF itself.
                pass
            else:
                return [Document(page_content=cached.decode('utf-8'))]

        documents = PyPDFium2Loader(file_path=self._file_path).load()

        # Save the plain text for caching; it is only built when there is a
        # key to store it under.
        if cache_key:
            text = "\n\n".join(document.page_content for document in documents)
            storage.save(cache_key, text.encode('utf-8'))

        return documents
|  | 
 |