kajalag committed on
Commit
c2f0414
1 Parent(s): d4e0f70

Upload preprocessor.py

Browse files
Files changed (1) hide show
  1. preprocessor.py +111 -0
preprocessor.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ from textblob import TextBlob
4
+ import numpy as np
5
+ import nltk
6
+ import nltk.data
7
+ from nltk.sentiment.vader import SentimentIntensityAnalyzer
8
+ from tqdm.notebook import tqdm
9
# Fetch the VADER lexicon before building the analyzer: the constructor loads
# the lexicon right away and fails with LookupError when it is not installed.
nltk.download('vader_lexicon')
# Shared analyzer instance used by preprocess() to score each message.
sia = SentimentIntensityAnalyzer()
11
+
12
def preprocess(data):
    """Parse an exported chat log and score the sentiment of every message.

    The text is split on timestamp prefixes of the form
    ``"<m>/<d>/<yy>, <H>:<MM> - "``; each entry is separated into a sender and
    a message body, calendar columns are derived from the timestamp, and the
    remaining user messages are scored with VADER and TextBlob.
    NOTE(review): the layout and the ``<Media omitted>`` marker look like a
    WhatsApp text export -- confirm against the caller.

    Args:
        data: Full chat export as a single string.

    Returns:
        A ``pandas.DataFrame`` with one row per user message (group
        notifications, media placeholders and rows with empty fields are
        dropped) and the columns ``users``, ``message``, ``neg``, ``neu``,
        ``pos``, ``compound``, ``date``, ``year``, ``day``, ``hour``,
        ``minute``, ``Day_name``, ``Date``, ``Month``, ``Month_name``,
        ``period``, ``Subjectivity``, ``Polarity``, ``Analysis`` and
        ``vader_Analysis``.
    """
    # Timestamp prefix that opens every entry, e.g. "12/31/21, 23:59 - ".
    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

    # Element 0 of the split is whatever precedes the first timestamp.
    raw_messages = re.split(pattern, data)[1:]
    dates = re.findall(pattern, data)
    df = pd.DataFrame({'user_message': raw_messages, 'message_date': dates})
    # Timestamps are parsed as month/day/two-digit-year, 24-hour clock.
    df['message_date'] = pd.to_datetime(
        df['message_date'], format='%m/%d/%y, %H:%M - '
    )
    df.rename(columns={'message_date': 'date'}, inplace=True)

    users = []
    messages = []
    for raw in df['user_message']:
        # maxsplit=1: only the first ": " separates the sender from the text,
        # so a message that itself contains ": " is kept whole.
        entry = re.split(r'([\w\W]+?):\s', raw, maxsplit=1)
        if entry[1:]:
            users.append(entry[1])
            messages.append(entry[2])
        else:
            # No "sender: " prefix -> treated as a system/group notification.
            users.append('group_notification')
            messages.append(entry[0])
    df['users'] = users
    df['message'] = messages
    df.drop(columns=['user_message'], inplace=True)

    df['year'] = df['date'].dt.year
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute
    df['Day_name'] = df['date'].dt.day_name()
    df['Date'] = df['date'].dt.date
    df['Month'] = df['date'].dt.month
    df['Month_name'] = df['date'].dt.month_name()

    def hour_window(hour):
        """Return the "<start>-<end>" label of the one-hour slot at *hour*."""
        if hour == 23:
            return f"{hour}-00"
        if hour == 0:
            return f"00-{hour + 1}"
        return f"{hour}-{hour + 1}"

    df['period'] = [hour_window(hour) for hour in df['hour']]

    # Keep real user messages only; copy so later edits never touch `df`.
    keep = (df['users'] != 'group_notification') & (
        df['message'] != '<Media omitted>\n'
    )
    temp = df[keep].copy()
    # Empty strings count as missing; rows with any missing field are dropped.
    temp = temp.replace("", np.nan).dropna()

    def cleanTxt(text):
        """Strip @mentions, '#' characters and newlines from *text*."""
        text = re.sub(r'@[A-Za-z0-9]+', '', text)
        text = re.sub(r'#', '', text)
        text = text.replace('\n', "")
        return text

    temp['message'] = temp['message'].apply(cleanTxt)
    temp['users'] = temp['users'].apply(cleanTxt)
    temp = temp.reset_index(drop=True)

    # Score every message on its own. Keying the scores by user would keep
    # only each user's last message and smear it over all of their rows.
    scores = [
        sia.polarity_scores(text)
        for text in tqdm(temp['message'], total=len(temp))
    ]
    # Explicit columns keep the frame well-formed when there are no messages.
    score_df = pd.DataFrame(scores, columns=['neg', 'neu', 'pos', 'compound'])

    # Column layout: users, message, VADER scores, then the date columns.
    vaders = pd.concat(
        [
            temp[['users', 'message']],
            score_df,
            temp.drop(columns=['users', 'message']),
        ],
        axis=1,
    )

    # Evaluate TextBlob once per message and read both measures from it.
    sentiments = [TextBlob(text).sentiment for text in vaders['message']]
    vaders['Subjectivity'] = [s.subjectivity for s in sentiments]
    vaders['Polarity'] = [s.polarity for s in sentiments]

    def label_polarity(score):
        """Label a TextBlob polarity score by its sign."""
        if score < 0:
            return 'Negative'
        if score == 0:
            return 'Neutral'
        return 'Positive'

    def label_compound(score):
        """Label a VADER compound score using the file's thresholds."""
        # NOTE(review): a compound score of exactly 0 is labelled 'Negative'
        # here; thresholds kept as they were -- confirm this is intended.
        if score <= 0:
            return 'Negative'
        if score < 0.2960:
            return 'Neutral'
        return 'Positive'

    vaders['Analysis'] = vaders['Polarity'].apply(label_polarity)
    vaders['vader_Analysis'] = vaders['compound'].apply(label_compound)

    return vaders