mirror of
				https://github.com/langgenius/dify.git
				synced 2025-10-26 00:18:44 +00:00 
			
		
		
		
	
		
			
	
	
		
			36 lines
		
	
	
		
			803 B
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			36 lines
		
	
	
		
			803 B
		
	
	
	
		
			Python
		
	
	
	
	
	
|   | import logging | ||
|  | from typing import List | ||
|  | 
 | ||
|  | from bs4 import BeautifulSoup | ||
|  | from langchain.document_loaders.base import BaseLoader | ||
|  | from langchain.schema import Document | ||
|  | 
 | ||
|  | logger = logging.getLogger(__name__) | ||
|  | 
 | ||
|  | 
 | ||
class HTMLLoader(BaseLoader):
    """Loader that turns a single HTML file into plain text.

    The file is parsed with BeautifulSoup's ``html.parser`` backend and the
    visible text of the whole document is returned as one ``Document``.

    Args:
        file_path: Path to the HTML file to load.
    """

    def __init__(self, file_path: str):
        """Remember the path of the HTML file; nothing is read yet."""
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Return a one-element list holding the file's extracted text."""
        document = Document(page_content=self._load_as_text())
        return [document]

    def _load_as_text(self) -> str:
        """Parse the file and return its text with surrounding whitespace removed.

        Returns:
            The stripped text of the document, or an empty string when the
            parser yields no text at all.
        """
        # Binary mode hands the raw bytes to BeautifulSoup, which does its
        # own decoding.
        with open(self._file_path, "rb") as html_file:
            parsed = BeautifulSoup(html_file, 'html.parser')
            extracted = parsed.get_text()
            if not extracted:
                return ''
            return extracted.strip()