mirror of
https://github.com/langgenius/dify.git
synced 2025-10-24 15:38:59 +00:00
36 lines
803 B
Python
36 lines
803 B
Python
![]() |
import logging
|
||
|
from typing import List
|
||
|
|
||
|
from bs4 import BeautifulSoup
|
||
|
from langchain.document_loaders.base import BaseLoader
|
||
|
from langchain.schema import Document
|
||
|
|
||
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
||
|
|
||
|
|
||
|
class HTMLLoader(BaseLoader):
    """Loader that turns one HTML file into a single plain-text document.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str):
        """Store the path of the HTML file that will be read on ``load``."""
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Return a one-element list whose document holds the file's text."""
        content = self._load_as_text()
        return [Document(page_content=content)]

    def _load_as_text(self) -> str:
        """Parse the file with ``html.parser`` and return its stripped text.

        Returns:
            The text extracted by ``get_text()`` with surrounding whitespace
            removed, or an empty string when no text was extracted.
        """
        # The file is opened in binary mode and the handle is passed
        # directly to BeautifulSoup.
        with open(self._file_path, "rb") as html_file:
            parsed = BeautifulSoup(html_file, 'html.parser')
            extracted = parsed.get_text()

        if not extracted:
            return ''
        return extracted.strip()
|