File size: 1,029 Bytes
6f75600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import pandas as pd

def clean_data(df):
    """Return a cleaned, de-duplicated copy of a comment/reply DataFrame.

    The input frame is left untouched; all work happens on a copy.

    Steps, in order:
      1. Strip the literal prefix 'http://www.youtube.com/@' from 'Channel'.
      2. Forward-fill missing 'Comment' and 'CommentedUserID' values from the
         previous row, and fill missing 'ToWhomTheyReplied' values with the
         (already stripped) 'Channel' value of the same row.
      3. Drop rows whose 'Reply' is missing.
      4. Add 'comment_length' and 'reply_length' columns holding the string
         lengths of 'Comment' and 'Reply'.
      5. Print the number of fully duplicated rows and drop them.

    Args:
        df: DataFrame containing the columns 'Channel', 'Comment',
            'CommentedUserID', 'ToWhomTheyReplied' and 'Reply'. The
            'Channel', 'Comment' and 'Reply' columns must support the
            pandas ``.str`` accessor.

    Returns:
        A new DataFrame with the cleaning steps applied. Surviving rows keep
        their original index labels.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame is not left half-cleaned.
    cleaned = df.copy()

    # Remove 'http://www.youtube.com/@' from the 'Channel' column
    cleaned['Channel'] = cleaned['Channel'].str.replace(
        'http://www.youtube.com/@', '', regex=False
    )

    # Series.ffill() replaces the deprecated fillna(method='ffill').
    cleaned['Comment'] = cleaned['Comment'].ffill()
    cleaned['CommentedUserID'] = cleaned['CommentedUserID'].ffill()
    cleaned['ToWhomTheyReplied'] = cleaned['ToWhomTheyReplied'].fillna(
        cleaned['Channel']
    )

    # Drop rows without a reply first, then measure lengths, so that
    # 'reply_length' is computed only over rows that actually have a reply.
    # assign() builds a fresh frame, which avoids chained-assignment warnings.
    cleaned = cleaned.dropna(subset=['Reply']).assign(
        comment_length=lambda frame: frame['Comment'].str.len(),
        reply_length=lambda frame: frame['Reply'].str.len(),
    )

    # Report, then remove, rows that are exact duplicates of an earlier row.
    num_duplicates = cleaned.duplicated().sum()
    print('Number of duplicate rows:', num_duplicates)

    return cleaned.drop_duplicates()