Spaces:
Sleeping
Sleeping
File size: 1,029 Bytes
6f75600 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
import pandas as pd
def clean_data(df):
    """Clean a comment/reply DataFrame and return it without duplicate rows.

    The frame is expected to have the columns 'Channel', 'Comment',
    'CommentedUserID', 'ToWhomTheyReplied' and 'Reply'.

    Steps, in order:
      1. Strip the literal prefix 'http://www.youtube.com/@' from 'Channel'.
      2. Forward-fill missing 'Comment' and 'CommentedUserID' values from the
         previous row; fill missing 'ToWhomTheyReplied' with the row's
         (already stripped) 'Channel' value.
      3. Drop rows whose 'Reply' is missing.
      4. Add 'comment_length' and 'reply_length' columns (string lengths).
      5. Print the number of fully duplicated rows and drop them.

    Note:
        Steps 1-4 modify ``df`` in place (columns are reassigned/added and
        rows are dropped with ``inplace=True``). Only the duplicate removal
        produces a new object, so the caller's ``df`` keeps its duplicates.

    Args:
        df: pandas DataFrame holding the columns listed above.

    Returns:
        A new DataFrame equal to the cleaned ``df`` with duplicate rows
        removed (first occurrence kept, original index labels retained).
    """
    # regex=False: the prefix is matched literally, so '.' is not a wildcard.
    df['Channel'] = df['Channel'].str.replace('http://www.youtube.com/@', '', regex=False)

    # Series.ffill() is the supported spelling of fillna(method='ffill'),
    # which is deprecated since pandas 2.1. A missing value in the very
    # first row has nothing to propagate from and therefore stays missing.
    df['Comment'] = df['Comment'].ffill()
    df['CommentedUserID'] = df['CommentedUserID'].ffill()

    # Filling with a Series aligns on the index: each gap takes the
    # 'Channel' value of the same row.
    df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])

    # Rows without a reply are discarded from the caller's frame.
    df.dropna(subset=['Reply'], inplace=True)

    df['comment_length'] = df['Comment'].str.len()
    df['reply_length'] = df['Reply'].str.len()

    # Count duplicates before dropping them so the number can be reported.
    num_duplicates = df.duplicated().sum()
    df_deduplicated = df.drop_duplicates()
    print('Number of duplicate rows:', num_duplicates)

    return df_deduplicated
|