import os
from datetime import datetime
from typing import Any, Dict, List, Tuple

import praw

from utilities.my_logger import setup_logger

# Setup logging
logger = setup_logger(__name__)

# Subreddit and pull limit come from the environment
subreddit_var = os.getenv("SUBREDDIT")
reddit_pull_limit = int(os.getenv("REDDIT_PULL_LIMIT"))


def get_reddit_instance() -> praw.Reddit:
    """Initialize and return a Reddit instance using PRAW."""
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT'),
        ratelimit_seconds=20,
    )


def extract_submission_data(submission: praw.models.Submission) -> Dict[str, Any]:
    """Extract and return relevant data from a given Reddit submission."""
    return {
        "content": submission.selftext,
        "poster": str(submission.author),
        "date_utc": datetime.utcfromtimestamp(submission.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
        "flair": submission.link_flair_text,
        "title": submission.title,
        "score": submission.ups,
        "permalink": submission.permalink,
        "nsfw": submission.over_18,
    }


def extract_comment_data(comment: praw.models.Comment) -> Dict[str, Any]:
    """Extract and return relevant data from a given Reddit comment."""
    return {
        'content': comment.body,
        'poster': str(comment.author),
        'date_utc': datetime.utcfromtimestamp(comment.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
        'flair': comment.author_flair_text,
        'ups': comment.ups,
        'score': comment.score,
        'permalink': comment.permalink,
        'depth': comment.depth,
        'link_id': comment.link_id,
        'parent_id': comment.parent_id,
        'id': comment.id,
    }


def praw_downloader() -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Extract all submissions and their comments from the configured subreddit."""
    reddit = get_reddit_instance()
    subreddit = reddit.subreddit(subreddit_var)

    logger.info(f'Starting to fetch submissions from {subreddit_var}.')

    submissions = []
    comments_list = []
    for submission in subreddit.new(limit=reddit_pull_limit):  # Set limit=None to get all posts
        # logger.debug(f'Processing post {submission.id} - {submission.title}')
        data = extract_submission_data(submission)

        # Resolve every "load more comments" stub, then flatten the comment tree.
        submission.comments.replace_more(limit=None)
        logger.debug(f'Opened all comments for post: {submission.title}')
        for comment in submission.comments.list():
            comments_list.append(extract_comment_data(comment))

        submissions.append(data)

    logger.info(f'Finished downloading {len(submissions)} submissions, {len(comments_list)} comments')
    return submissions, comments_list


if __name__ == "__main__":
    praw_downloader()