import pandas as pd


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a comment/reply table and return it without duplicate rows.

    The frame must contain the columns ``Channel``, ``Comment``,
    ``CommentedUserID``, ``ToWhomTheyReplied`` and ``Reply``.

    Steps, in order:

    1. Strip the literal ``http://www.youtube.com/@`` from ``Channel``.
    2. Forward-fill missing ``Comment`` and ``CommentedUserID`` values from
       the closest preceding row that has one.
    3. Fill missing ``ToWhomTheyReplied`` values with the (already stripped)
       ``Channel`` value of the same row.
    4. Drop rows whose ``Reply`` is missing.
    5. Add ``comment_length`` and ``reply_length`` columns.
    6. Print the number of fully duplicated rows and drop them.

    Note:
        Steps 1-5 modify ``df`` in place, so the caller's frame is changed
        as well. Only the de-duplication in step 6 is applied to a new
        frame; the returned object is therefore not ``df`` itself.

    Args:
        df: Table to clean. It is mutated in place (see note above).

    Returns:
        A new DataFrame holding the cleaned rows with exact duplicates
        removed. The original index labels of the kept rows are preserved.
    """
    # Literal (non-regex) replacement: '.' and '@' must not be treated as
    # regular-expression metacharacters.
    df['Channel'] = df['Channel'].str.replace(
        'http://www.youtube.com/@', '', regex=False
    )

    # `Series.ffill()` replaces the deprecated `fillna(method='ffill')`.
    # A missing value in the very first row has nothing to inherit from and
    # stays missing.
    # NOTE(review): presumably reply rows leave these columns blank and
    # belong to the comment above them - confirm against the data source.
    df['Comment'] = df['Comment'].ffill()
    df['CommentedUserID'] = df['CommentedUserID'].ffill()

    # Aligned on the index: each gap takes the channel name of its own row.
    df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])

    # Rows without a reply are discarded.
    df.dropna(subset=['Reply'], inplace=True)

    df['comment_length'] = df['Comment'].str.len()
    df['reply_length'] = df['Reply'].str.len()

    # Count duplicates before dropping them so the figure can be reported.
    num_duplicates = df.duplicated().sum()
    df_deduplicated = df.drop_duplicates()

    print('Number of duplicate rows:', num_duplicates)

    return df_deduplicated