Abu1998 committed on
Commit
6f75600
1 Parent(s): 39b58b2

Create data_cleaning.py

Browse files
Files changed (1) hide show
  1. data_cleaning.py +28 -0
data_cleaning.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a comment/reply DataFrame and return a deduplicated copy.

    The following steps are applied, in order:

    1. Strip the literal prefix ``http://www.youtube.com/@`` from ``Channel``.
    2. Forward-fill missing ``Comment`` and ``CommentedUserID`` values, and
       fill missing ``ToWhomTheyReplied`` values with the (already stripped)
       ``Channel`` value of the same row.
    3. Drop rows whose ``Reply`` is missing.
    4. Add ``comment_length`` and ``reply_length`` columns holding the string
       lengths of ``Comment`` and ``Reply``.
    5. Print the number of fully duplicated rows and drop them.

    Args:
        df: Input frame. Must contain the columns ``Channel``, ``Comment``,
            ``CommentedUserID``, ``ToWhomTheyReplied`` and ``Reply``.
            It is not modified.

    Returns:
        A new DataFrame with the cleaning steps applied. The original index
        labels of the surviving rows are preserved.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    # Work on a copy so the caller's frame is never left half-cleaned.
    df = df.copy()

    # Remove 'http://www.youtube.com/@' from the 'Channel' column
    df['Channel'] = df['Channel'].str.replace(
        'http://www.youtube.com/@', '', regex=False
    )

    # Forward-fill before dropping rows, so a value may be propagated from a
    # row that is later removed for having no 'Reply'.
    # `.ffill()` replaces the deprecated `fillna(method='ffill')`.
    df['Comment'] = df['Comment'].ffill()
    df['CommentedUserID'] = df['CommentedUserID'].ffill()
    df['ToWhomTheyReplied'] = df['ToWhomTheyReplied'].fillna(df['Channel'])

    # Drop rows where 'Reply' column is missing
    df = df.dropna(subset=['Reply'])

    # Calculate comment and reply lengths
    df['comment_length'] = df['Comment'].str.len()
    df['reply_length'] = df['Reply'].str.len()

    # Count duplicates before removing them so the figure can be reported.
    num_duplicates = df.duplicated().sum()
    df_deduplicated = df.drop_duplicates()

    # Print number of duplicates
    print('Number of duplicate rows:', num_duplicates)

    return df_deduplicated