Spaces:
Sleeping
Sleeping
Create data_cleaning.py
Browse files- data_cleaning.py +28 -0
data_cleaning.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
def clean_data(df):
    """Clean a comments/replies DataFrame and return the de-duplicated result.

    The input must contain the columns 'Channel', 'Comment',
    'CommentedUserID', 'ToWhomTheyReplied' and 'Reply'. The caller's
    DataFrame is left untouched; all work happens on a copy.

    Steps, in order:
      1. Strip the literal 'http://www.youtube.com/@' from 'Channel'.
      2. Forward-fill missing 'Comment' and 'CommentedUserID' values.
      3. Fill missing 'ToWhomTheyReplied' values with the row's 'Channel'.
      4. Drop rows whose 'Reply' is missing.
      5. Add 'comment_length' and 'reply_length' (string lengths).
      6. Print the number of fully duplicated rows and drop them.

    Args:
        df: pandas DataFrame holding the columns listed above.

    Returns:
        A new DataFrame with the cleaned columns, the two added length
        columns, and duplicate rows removed (first occurrence kept).
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    cleaned = df.copy()

    # Remove 'http://www.youtube.com/@' from the 'Channel' column
    cleaned['Channel'] = cleaned['Channel'].str.replace(
        'http://www.youtube.com/@', '', regex=False
    )

    # Fill missing values in 'Comment', 'CommentedUserID', and
    # 'ToWhomTheyReplied'. Series.ffill() replaces the deprecated
    # fillna(method='ffill') spelling.
    cleaned['Comment'] = cleaned['Comment'].ffill()
    cleaned['CommentedUserID'] = cleaned['CommentedUserID'].ffill()
    cleaned['ToWhomTheyReplied'] = cleaned['ToWhomTheyReplied'].fillna(
        cleaned['Channel']
    )

    # Drop rows where 'Reply' column is missing
    cleaned = cleaned.dropna(subset=['Reply'])

    # Calculate comment and reply lengths
    cleaned['comment_length'] = cleaned['Comment'].str.len()
    cleaned['reply_length'] = cleaned['Reply'].str.len()

    # Count duplicates before dropping them so the count can be reported.
    num_duplicates = cleaned.duplicated().sum()
    df_deduplicated = cleaned.drop_duplicates()

    # Print number of duplicates
    print('Number of duplicate rows:', num_duplicates)

    return df_deduplicated
|