2023-04-05 22:36:21 -07:00

48 lines
1.2 KiB
Python

"""Simple Web scraper."""
from typing import List
from langchain.requests import RequestsWrapper
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
class SimpleWebPageReader(BaseReader):
    """Simple web page reader.

    Fetches each given URL and wraps the response body in a ``Document``.

    Args:
        html_to_text (bool): Whether to convert the fetched HTML to text.
            Requires the `html2text` package.

    """

    def __init__(self, html_to_text: bool = False) -> None:
        """Initialize with parameters."""
        self._html_to_text = html_to_text

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load one document per URL.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Document]: One document per URL, in the same order as
                ``urls``.

        Raises:
            ValueError: If ``urls`` is not a list.
            ImportError: If ``html_to_text`` is enabled, ``urls`` is
                non-empty, and the `html2text` package is not installed.

        """
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        # Resolve the optional converter once, before any request is sent,
        # so a missing `html2text` install fails fast instead of after the
        # first page has already been downloaded. It is skipped for an empty
        # list so that case never touches the optional dependency.
        convert = None
        if self._html_to_text and urls:
            import html2text

            convert = html2text.html2text

        requests = RequestsWrapper()
        documents = []
        for url in urls:
            response = requests.get(url)
            if convert is not None:
                response = convert(response)
            documents.append(Document(response))
        return documents