A2D / data_cleaning.py
Abu1998's picture
Create data_cleaning.py
6f75600 verified
raw
history blame
1.03 kB
import pandas as pd
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a comment/reply DataFrame and return it without duplicate rows.

    The frame must contain the columns 'Channel', 'Comment',
    'CommentedUserID', 'ToWhomTheyReplied' and 'Reply'.

    Steps, in order:
      1. Strip the 'http://www.youtube.com/@' prefix text from 'Channel'.
      2. Forward-fill missing 'Comment' and 'CommentedUserID' values.
      3. Fill missing 'ToWhomTheyReplied' values with the row's 'Channel'.
      4. Drop rows whose 'Reply' is missing.
      5. Add 'comment_length' and 'reply_length' columns.
      6. Print the number of fully duplicated rows and drop them.

    Note:
        Steps 1-5 modify ``df`` in place (columns are reassigned and rows are
        dropped with ``inplace=True``). Only the final de-duplication produces
        a new object, so the caller's frame still contains duplicates.

    Args:
        df: The raw data to clean. Mutated in place.

    Returns:
        A new DataFrame equal to the cleaned ``df`` with duplicate rows removed.

    Raises:
        KeyError: If any of the required columns is absent.
    """
    # Remove 'http://www.youtube.com/@' from the 'Channel' column
    # (literal match, not a regular expression).
    df['Channel'] = df['Channel'].str.replace(
        'http://www.youtube.com/@', '', regex=False
    )

    # Forward-fill missing values. Series.ffill() is the supported
    # replacement for the deprecated fillna(method='ffill').
    df['Comment'] = df['Comment'].ffill()
    df['CommentedUserID'] = df['CommentedUserID'].ffill()

    # A missing reply target falls back to the (already cleaned) channel name.
    df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])

    # Drop rows where 'Reply' column is missing
    df.dropna(subset=['Reply'], inplace=True)

    # Calculate comment and reply lengths
    df['comment_length'] = df['Comment'].str.len()
    df['reply_length'] = df['Reply'].str.len()

    # Count duplicates before removing them so the number can be reported.
    num_duplicates = df.duplicated().sum()
    df_deduplicated = df.drop_duplicates()

    # Print number of duplicates
    print('Number of duplicate rows:', num_duplicates)
    return df_deduplicated