mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-14 03:31:41 +00:00

* cr * cr * cr --------- Co-authored-by: Jerry Liu <jerry@robustintelligence.com> Co-authored-by: Jesse Zhang <jessetanzhang@gmail.com>
74 lines
1.9 KiB
Python
74 lines
1.9 KiB
Python
"""Rss reader."""
|
|
|
|
from typing import List
|
|
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
|
|
class RssReader(BaseReader):
    """RSS reader.

    Reads content from one or more RSS feeds and turns every feed entry
    into a ``Document``.

    """

    def __init__(self, html_to_text: bool = False) -> None:
        """Initialize with parameters.

        Args:
            html_to_text (bool): Whether to convert HTML to text.
                Requires `html2text` package.

        Raises:
            ValueError: If `feedparser` is not installed, or if
                ``html_to_text`` is True and `html2text` is not installed.

        """
        # Fail fast at construction time when a dependency is missing,
        # rather than on the first call to ``load_data``.
        try:
            import feedparser  # noqa: F401
        except ImportError as err:
            # ValueError is kept (instead of ImportError) so existing callers
            # that catch it keep working; the original cause is chained.
            raise ValueError(
                "`feedparser` package not found, please run `pip install feedparser`"
            ) from err

        if html_to_text:
            try:
                import html2text  # noqa: F401
            except ImportError as err:
                raise ValueError(
                    "`html2text` package not found, please run `pip install html2text`"
                ) from err

        self._html_to_text = html_to_text

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from RSS feeds.

        Each entry of each feed becomes one document. The document text is
        the entry's first ``content`` item when present, otherwise its
        description/summary, otherwise an empty string. The entry's title
        and link are stored in ``extra_info`` (``None`` when the feed does
        not provide them).

        Args:
            urls (List[str]): List of RSS URLs to load.

        Returns:
            List[Document]: List of documents.

        Raises:
            ValueError: If ``urls`` is not a list.

        """
        import feedparser

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        # Import once up front instead of once per entry inside the loop.
        if self._html_to_text:
            import html2text

        documents = []

        for url in urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                # Feed elements are optional, so use ``.get`` lookups: a
                # single sparse entry must not abort the whole load with an
                # AttributeError/IndexError.
                content = entry.get("content")
                if content:
                    data = content[0].value
                else:
                    data = entry.get("description") or entry.get("summary") or ""

                if self._html_to_text:
                    data = html2text.html2text(data)

                extra_info = {
                    "title": entry.get("title"),
                    "link": entry.get("link"),
                }
                documents.append(Document(data, extra_info=extra_info))

        return documents
|