58 lines
1.9 KiB
Python
Raw Normal View History

"""Simple Reader that loads text relevant to a certain search keyword from subreddits"""
2023-02-24 23:39:32 -08:00
from typing import List, Optional
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
class RedditReader(BaseReader):
    """Subreddit post and top-level comments reader for Reddit."""

    def load_data(
        self,
        subreddits: List[str],
        search_keys: List[str],
        post_limit: Optional[int] = 10,
    ) -> List[Document]:
        """Load text from relevant posts and their top-level comments.

        Every subreddit is searched once per keyword. For each post returned,
        one Document is created from the post's self text, followed by one
        Document per top-level comment body.

        Reddit credentials are read from the environment variables
        ``REDDIT_CLIENT_ID``, ``REDDIT_CLIENT_SECRET``, ``REDDIT_USER_AGENT``,
        ``REDDIT_USERNAME`` and ``REDDIT_PASSWORD``.

        Args:
            subreddits (List[str]): List of subreddits you'd like to read from
            search_keys (List[str]): List of keywords you'd like to use to
                search from subreddit(s)
            post_limit (Optional[int]): Maximum number of posts fetched for
                each (subreddit, keyword) search, defaults to 10. The value is
                passed unchanged as PRAW's ``limit`` argument, so ``None``
                leaves the cap up to PRAW.

        Returns:
            List[Document]: Post and comment documents, in the order they were
            retrieved.
        """
        # Imported lazily so that praw is only required when the reader is used.
        import os

        import praw
        from praw.models import MoreComments

        reddit = praw.Reddit(
            client_id=os.getenv("REDDIT_CLIENT_ID"),
            client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
            user_agent=os.getenv("REDDIT_USER_AGENT"),
            username=os.getenv("REDDIT_USERNAME"),
            password=os.getenv("REDDIT_PASSWORD"),
        )

        posts: List[Document] = []
        for sr in subreddits:
            subreddit = reddit.subreddit(sr)
            for kw in search_keys:
                for post in subreddit.search(kw, limit=post_limit):
                    posts.append(Document(post.selftext))
                    # MoreComments entries are placeholders for comments that
                    # have not been loaded; they have no body, so skip them.
                    posts.extend(
                        Document(comment.body)
                        for comment in post.comments
                        if not isinstance(comment, MoreComments)
                    )
        return posts