Spaces:
Sleeping
Sleeping
import pandas as pd | |
def clean_data(df):
    """Return a cleaned, de-duplicated copy of a comment/reply DataFrame.

    The input frame is left untouched; all work happens on a copy.

    Steps, in order:
      1. Strip the literal prefix 'http://www.youtube.com/@' from 'Channel'.
      2. Forward-fill missing 'Comment' and 'CommentedUserID' values from the
         previous row, and fill missing 'ToWhomTheyReplied' values with the
         (already stripped) 'Channel' value of the same row.
      3. Drop rows whose 'Reply' is missing.
      4. Add 'comment_length' and 'reply_length' columns holding the string
         lengths of 'Comment' and 'Reply'.
      5. Print the number of fully duplicated rows and drop them.

    Args:
        df: DataFrame containing the columns 'Channel', 'Comment',
            'CommentedUserID', 'ToWhomTheyReplied' and 'Reply'.

    Returns:
        A new DataFrame with the steps above applied. The original index
        labels of the surviving rows are preserved.

    Raises:
        KeyError: If any of the required columns is absent.
    """
    # Work on a copy so the caller's frame is not left half-cleaned (and so
    # passing a slice of another frame cannot trigger SettingWithCopyWarning).
    cleaned = df.copy()

    # Plain (non-regex) replacement: the '.' characters must match literally.
    cleaned['Channel'] = cleaned['Channel'].str.replace(
        'http://www.youtube.com/@', '', regex=False
    )

    # Series.ffill() replaces fillna(method='ffill'), which newer pandas
    # releases deprecate and then remove.
    cleaned['Comment'] = cleaned['Comment'].ffill()
    cleaned['CommentedUserID'] = cleaned['CommentedUserID'].ffill()
    cleaned['ToWhomTheyReplied'] = cleaned['ToWhomTheyReplied'].fillna(
        cleaned['Channel']
    )

    # Rows without a reply are dropped before the lengths are computed, so
    # 'reply_length' never has to represent a missing value.
    cleaned = cleaned.dropna(subset=['Reply'])
    cleaned = cleaned.assign(
        comment_length=cleaned['Comment'].str.len(),
        reply_length=cleaned['Reply'].str.len(),
    )

    # Count duplicates before removing them so the printed figure reflects
    # what is actually discarded.
    num_duplicates = cleaned.duplicated().sum()
    print('Number of duplicate rows:', num_duplicates)
    return cleaned.drop_duplicates()