|
from typing import Dict, List |
|
|
|
import pandas as pd |
|
|
|
from utilities.my_logger import setup_logger |
|
|
|
|
|
# Module-level logger, created by the project's setup_logger helper and keyed by this module's name.
logger = setup_logger(__name__) |
|
|
|
|
|
def preprocess_praw_data(submissions: List[Dict]) -> pd.DataFrame:
    """
    Preprocesses praw submission data into a DataFrame.

    Parses the ``date_utc`` column into datetimes, drops the ``poster_link``
    column when it is present, and adds an ``id`` column taken from the
    fifth '/'-separated segment of each ``permalink``.

    Parameters:
    - submissions: List of submission dictionaries. Non-empty input must
      provide the ``date_utc`` and ``permalink`` keys.

    Returns:
    - pd.DataFrame: Preprocessed DataFrame. Empty input yields an empty
      DataFrame with no columns added.

    Raises:
    - KeyError: If non-empty input lacks ``date_utc`` or ``permalink``.
    """
    praw_df = pd.DataFrame(submissions)

    # Nothing to transform; without this guard the column lookups below
    # fail on a frame that has no columns at all.
    if praw_df.empty:
        return praw_df

    # Bracket indexing always targets the column; attribute-style assignment
    # can silently set a plain attribute instead.
    praw_df["date_utc"] = pd.to_datetime(praw_df["date_utc"])

    # errors="ignore" keeps the drop a no-op when the column is absent.
    praw_df = praw_df.drop(columns=["poster_link"], errors="ignore")

    # NOTE(review): index 4 presumably matches permalinks shaped like
    # "/r/<subreddit>/comments/<id>/<slug>/" — confirm against the caller.
    praw_df["id"] = praw_df["permalink"].str.split("/").str[4]

    return praw_df
|
|
|
def preprocess_praw_comment_data(comments: List[Dict]) -> pd.DataFrame:
    """
    Preprocesses praw comment data into a DataFrame.

    Builds a DataFrame from the comment dictionaries and parses its
    ``date_utc`` column into datetimes. All other columns are left as-is.

    Parameters:
    - comments: List of comment dictionaries. Non-empty input must provide
      the ``date_utc`` key.

    Returns:
    - pd.DataFrame: Preprocessed DataFrame. Empty input yields an empty
      DataFrame.

    Raises:
    - KeyError: If non-empty input lacks ``date_utc``.
    """
    praw_df = pd.DataFrame(comments)

    # Nothing to transform; without this guard the column lookup below
    # fails on a frame that has no columns at all.
    if praw_df.empty:
        return praw_df

    # Bracket indexing always targets the column; attribute-style assignment
    # can silently set a plain attribute instead.
    praw_df["date_utc"] = pd.to_datetime(praw_df["date_utc"])

    return praw_df
|
|