ruba2ksa committed on
Commit
10c9523
1 Parent(s): f1e37ca

Upload Emotion Detection in Text.ipynb

Browse files
Files changed (1) hide show
  1. Emotion Detection in Text.ipynb +785 -0
Emotion Detection in Text.ipynb ADDED
@@ -0,0 +1,785 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "56cccab6",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Emotion Detection in Text"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "f0814628-3d83-4fd6-a511-2eccf79f9f1e",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "# EDA\n",
19
+ "import pandas as pd\n",
20
+ "import numpy as np\n",
21
+ "\n",
22
+ "# Load Data Viz Pkgs\n",
23
+ "import seaborn as sns\n",
24
+ "\n",
25
+ "# Load Text Cleaning Pkgs\n",
26
+ "import neattext.functions as nfx\n",
27
+ "\n",
28
+ "# Load ML Pkgs\n",
29
+ "# Estimators\n",
30
+ "from sklearn.linear_model import LogisticRegression\n",
31
+ "from sklearn.naive_bayes import MultinomialNB\n",
32
+ "\n",
33
+ "# Transformers\n",
34
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
35
+ "from sklearn.model_selection import train_test_split\n",
36
+ "from sklearn.metrics import accuracy_score,classification_report,confusion_matrix"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 2,
42
+ "id": "b209e004-ab77-4407-8689-b4318944d47f",
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "# Load Dataset\n",
47
+ "df = pd.read_csv(\"../data/emotion_dataset_raw.csv\")"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 3,
53
+ "id": "fea2d4c0-3bdd-405e-ab69-507ceaac36cb",
54
+ "metadata": {},
55
+ "outputs": [
56
+ {
57
+ "data": {
58
+ "text/html": [
59
+ "<div>\n",
60
+ "<style scoped>\n",
61
+ " .dataframe tbody tr th:only-of-type {\n",
62
+ " vertical-align: middle;\n",
63
+ " }\n",
64
+ "\n",
65
+ " .dataframe tbody tr th {\n",
66
+ " vertical-align: top;\n",
67
+ " }\n",
68
+ "\n",
69
+ " .dataframe thead th {\n",
70
+ " text-align: right;\n",
71
+ " }\n",
72
+ "</style>\n",
73
+ "<table border=\"1\" class=\"dataframe\">\n",
74
+ " <thead>\n",
75
+ " <tr style=\"text-align: right;\">\n",
76
+ " <th></th>\n",
77
+ " <th>Emotion</th>\n",
78
+ " <th>Text</th>\n",
79
+ " </tr>\n",
80
+ " </thead>\n",
81
+ " <tbody>\n",
82
+ " <tr>\n",
83
+ " <th>0</th>\n",
84
+ " <td>neutral</td>\n",
85
+ " <td>Why ?</td>\n",
86
+ " </tr>\n",
87
+ " <tr>\n",
88
+ " <th>1</th>\n",
89
+ " <td>joy</td>\n",
90
+ " <td>Sage Act upgrade on my to do list for tommorow.</td>\n",
91
+ " </tr>\n",
92
+ " <tr>\n",
93
+ " <th>2</th>\n",
94
+ " <td>sadness</td>\n",
95
+ " <td>ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...</td>\n",
96
+ " </tr>\n",
97
+ " <tr>\n",
98
+ " <th>3</th>\n",
99
+ " <td>joy</td>\n",
100
+ " <td>Such an eye ! The true hazel eye-and so brill...</td>\n",
101
+ " </tr>\n",
102
+ " <tr>\n",
103
+ " <th>4</th>\n",
104
+ " <td>joy</td>\n",
105
+ " <td>@Iluvmiasantos ugh babe.. hugggzzz for u .! b...</td>\n",
106
+ " </tr>\n",
107
+ " </tbody>\n",
108
+ "</table>\n",
109
+ "</div>"
110
+ ],
111
+ "text/plain": [
112
+ " Emotion Text\n",
113
+ "0 neutral Why ? \n",
114
+ "1 joy Sage Act upgrade on my to do list for tommorow.\n",
115
+ "2 sadness ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...\n",
116
+ "3 joy Such an eye ! The true hazel eye-and so brill...\n",
117
+ "4 joy @Iluvmiasantos ugh babe.. hugggzzz for u .! b..."
118
+ ]
119
+ },
120
+ "execution_count": 3,
121
+ "metadata": {},
122
+ "output_type": "execute_result"
123
+ }
124
+ ],
125
+ "source": [
126
+ "df.head()"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 4,
132
+ "id": "430565a3-cf3b-4c6f-afa5-bafd084f5676",
133
+ "metadata": {},
134
+ "outputs": [
135
+ {
136
+ "data": {
137
+ "text/plain": [
138
+ "joy 11045\n",
139
+ "sadness 6722\n",
140
+ "fear 5410\n",
141
+ "anger 4297\n",
142
+ "surprise 4062\n",
143
+ "neutral 2254\n",
144
+ "disgust 856\n",
145
+ "shame 146\n",
146
+ "Name: Emotion, dtype: int64"
147
+ ]
148
+ },
149
+ "execution_count": 4,
150
+ "metadata": {},
151
+ "output_type": "execute_result"
152
+ }
153
+ ],
154
+ "source": [
155
+ "# Value Counts\n",
156
+ "df['Emotion'].value_counts()"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": 5,
162
+ "id": "531d3449-a959-4a19-bff0-3ffed551e619",
163
+ "metadata": {},
164
+ "outputs": [
165
+ {
166
+ "data": {
167
+ "text/plain": [
168
+ "<Axes: xlabel='Emotion', ylabel='count'>"
169
+ ]
170
+ },
171
+ "execution_count": 5,
172
+ "metadata": {},
173
+ "output_type": "execute_result"
174
+ },
175
+ {
176
+ "data": {
177
+ "image/png": "\n",
178
+ "text/plain": [
179
+ "<Figure size 640x480 with 1 Axes>"
180
+ ]
181
+ },
182
+ "metadata": {},
183
+ "output_type": "display_data"
184
+ }
185
+ ],
186
+ "source": [
187
+ "# Plot\n",
188
+ "sns.countplot(x='Emotion',data=df)"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": 6,
194
+ "id": "40f991d0-952f-40c1-bf00-f3476ce0436d",
195
+ "metadata": {
196
+ "jupyter": {
197
+ "outputs_hidden": true
198
+ },
199
+ "scrolled": false,
200
+ "tags": []
201
+ },
202
+ "outputs": [
203
+ {
204
+ "data": {
205
+ "text/plain": [
206
+ "['BTC_ADDRESS_REGEX',\n",
207
+ " 'CURRENCY_REGEX',\n",
208
+ " 'CURRENCY_SYMB_REGEX',\n",
209
+ " 'Counter',\n",
210
+ " 'DATE_REGEX',\n",
211
+ " 'EMAIL_REGEX',\n",
212
+ " 'EMOJI_REGEX',\n",
213
+ " 'HASTAG_REGEX',\n",
214
+ " 'MASTERCard_REGEX',\n",
215
+ " 'MD5_SHA_REGEX',\n",
216
+ " 'MOST_COMMON_PUNCT_REGEX',\n",
217
+ " 'NUMBERS_REGEX',\n",
218
+ " 'PHONE_REGEX',\n",
219
+ " 'PoBOX_REGEX',\n",
220
+ " 'SPECIAL_CHARACTERS_REGEX',\n",
221
+ " 'STOPWORDS',\n",
222
+ " 'STOPWORDS_de',\n",
223
+ " 'STOPWORDS_en',\n",
224
+ " 'STOPWORDS_es',\n",
225
+ " 'STOPWORDS_fr',\n",
226
+ " 'STOPWORDS_ru',\n",
227
+ " 'STOPWORDS_yo',\n",
228
+ " 'STREET_ADDRESS_REGEX',\n",
229
+ " 'TextFrame',\n",
230
+ " 'URL_PATTERN',\n",
231
+ " 'USER_HANDLES_REGEX',\n",
232
+ " 'VISACard_REGEX',\n",
233
+ " '__builtins__',\n",
234
+ " '__cached__',\n",
235
+ " '__doc__',\n",
236
+ " '__file__',\n",
237
+ " '__generate_text',\n",
238
+ " '__loader__',\n",
239
+ " '__name__',\n",
240
+ " '__numbers_dict',\n",
241
+ " '__package__',\n",
242
+ " '__spec__',\n",
243
+ " '_lex_richness_herdan',\n",
244
+ " '_lex_richness_maas_ttr',\n",
245
+ " 'clean_text',\n",
246
+ " 'defaultdict',\n",
247
+ " 'digit2words',\n",
248
+ " 'extract_btc_address',\n",
249
+ " 'extract_currencies',\n",
250
+ " 'extract_currency_symbols',\n",
251
+ " 'extract_dates',\n",
252
+ " 'extract_emails',\n",
253
+ " 'extract_emojis',\n",
254
+ " 'extract_hashtags',\n",
255
+ " 'extract_html_tags',\n",
256
+ " 'extract_mastercard_addr',\n",
257
+ " 'extract_md5sha',\n",
258
+ " 'extract_numbers',\n",
259
+ " 'extract_pattern',\n",
260
+ " 'extract_phone_numbers',\n",
261
+ " 'extract_postoffice_box',\n",
262
+ " 'extract_shortwords',\n",
263
+ " 'extract_special_characters',\n",
264
+ " 'extract_stopwords',\n",
265
+ " 'extract_street_address',\n",
266
+ " 'extract_terms_in_bracket',\n",
267
+ " 'extract_urls',\n",
268
+ " 'extract_userhandles',\n",
269
+ " 'extract_visacard_addr',\n",
270
+ " 'fix_contractions',\n",
271
+ " 'generate_sentence',\n",
272
+ " 'hamming_distance',\n",
273
+ " 'inverse_df',\n",
274
+ " 'lexical_richness',\n",
275
+ " 'markov_chain',\n",
276
+ " 'math',\n",
277
+ " 'nlargest',\n",
278
+ " 'normalize',\n",
279
+ " 'num2words',\n",
280
+ " 'random',\n",
281
+ " 're',\n",
282
+ " 'read_txt',\n",
283
+ " 'remove_accents',\n",
284
+ " 'remove_bad_quotes',\n",
285
+ " 'remove_btc_address',\n",
286
+ " 'remove_currencies',\n",
287
+ " 'remove_currency_symbols',\n",
288
+ " 'remove_custom_pattern',\n",
289
+ " 'remove_custom_words',\n",
290
+ " 'remove_dates',\n",
291
+ " 'remove_emails',\n",
292
+ " 'remove_emojis',\n",
293
+ " 'remove_hashtags',\n",
294
+ " 'remove_html_tags',\n",
295
+ " 'remove_mastercard_addr',\n",
296
+ " 'remove_md5sha',\n",
297
+ " 'remove_multiple_spaces',\n",
298
+ " 'remove_non_ascii',\n",
299
+ " 'remove_numbers',\n",
300
+ " 'remove_phone_numbers',\n",
301
+ " 'remove_postoffice_box',\n",
302
+ " 'remove_puncts',\n",
303
+ " 'remove_punctuations',\n",
304
+ " 'remove_shortwords',\n",
305
+ " 'remove_special_characters',\n",
306
+ " 'remove_stopwords',\n",
307
+ " 'remove_street_address',\n",
308
+ " 'remove_terms_in_bracket',\n",
309
+ " 'remove_urls',\n",
310
+ " 'remove_userhandles',\n",
311
+ " 'remove_visacard_addr',\n",
312
+ " 'replace_bad_quotes',\n",
313
+ " 'replace_currencies',\n",
314
+ " 'replace_currency_symbols',\n",
315
+ " 'replace_dates',\n",
316
+ " 'replace_emails',\n",
317
+ " 'replace_emojis',\n",
318
+ " 'replace_numbers',\n",
319
+ " 'replace_phone_numbers',\n",
320
+ " 'replace_special_characters',\n",
321
+ " 'replace_term',\n",
322
+ " 'replace_urls',\n",
323
+ " 'string',\n",
324
+ " 'term_freq',\n",
325
+ " 'to_txt',\n",
326
+ " 'unicodedata',\n",
327
+ " 'word_freq',\n",
328
+ " 'word_length_freq']"
329
+ ]
330
+ },
331
+ "execution_count": 6,
332
+ "metadata": {},
333
+ "output_type": "execute_result"
334
+ }
335
+ ],
336
+ "source": [
337
+ "# Data Cleaning\n",
338
+ "dir(nfx)"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": 7,
344
+ "id": "b1f87847-a91c-4bd6-a307-d746eb5aa9a0",
345
+ "metadata": {},
346
+ "outputs": [],
347
+ "source": [
348
+ "# User handles\n",
349
+ "df['Clean_Text'] = df['Text'].apply(nfx.remove_userhandles)"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": 8,
355
+ "id": "03886bc3-1ac4-4f1b-842b-e5d2d770ff81",
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": [
359
+ "# Stopwords\n",
360
+ "df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_stopwords)"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "markdown",
365
+ "id": "0ffcf4c7",
366
+ "metadata": {},
367
+ "source": [
368
+ "## We are not removing special characters because some rows consist only of special characters, and removing them would leave those rows empty."
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 9,
374
+ "id": "0a0fcc0c-4adf-4f0b-b226-164659ad70ba",
375
+ "metadata": {
376
+ "jupyter": {
377
+ "outputs_hidden": true
378
+ },
379
+ "tags": []
380
+ },
381
+ "outputs": [
382
+ {
383
+ "data": {
384
+ "text/html": [
385
+ "<div>\n",
386
+ "<style scoped>\n",
387
+ " .dataframe tbody tr th:only-of-type {\n",
388
+ " vertical-align: middle;\n",
389
+ " }\n",
390
+ "\n",
391
+ " .dataframe tbody tr th {\n",
392
+ " vertical-align: top;\n",
393
+ " }\n",
394
+ "\n",
395
+ " .dataframe thead th {\n",
396
+ " text-align: right;\n",
397
+ " }\n",
398
+ "</style>\n",
399
+ "<table border=\"1\" class=\"dataframe\">\n",
400
+ " <thead>\n",
401
+ " <tr style=\"text-align: right;\">\n",
402
+ " <th></th>\n",
403
+ " <th>Emotion</th>\n",
404
+ " <th>Text</th>\n",
405
+ " <th>Clean_Text</th>\n",
406
+ " </tr>\n",
407
+ " </thead>\n",
408
+ " <tbody>\n",
409
+ " <tr>\n",
410
+ " <th>0</th>\n",
411
+ " <td>neutral</td>\n",
412
+ " <td>Why ?</td>\n",
413
+ " <td>?</td>\n",
414
+ " </tr>\n",
415
+ " <tr>\n",
416
+ " <th>1</th>\n",
417
+ " <td>joy</td>\n",
418
+ " <td>Sage Act upgrade on my to do list for tommorow.</td>\n",
419
+ " <td>Sage Act upgrade list tommorow.</td>\n",
420
+ " </tr>\n",
421
+ " <tr>\n",
422
+ " <th>2</th>\n",
423
+ " <td>sadness</td>\n",
424
+ " <td>ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...</td>\n",
425
+ " <td>WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS...</td>\n",
426
+ " </tr>\n",
427
+ " <tr>\n",
428
+ " <th>3</th>\n",
429
+ " <td>joy</td>\n",
430
+ " <td>Such an eye ! The true hazel eye-and so brill...</td>\n",
431
+ " <td>eye ! true hazel eye-and brilliant ! Regular f...</td>\n",
432
+ " </tr>\n",
433
+ " <tr>\n",
434
+ " <th>4</th>\n",
435
+ " <td>joy</td>\n",
436
+ " <td>@Iluvmiasantos ugh babe.. hugggzzz for u .! b...</td>\n",
437
+ " <td>ugh babe.. hugggzzz u .! babe naamazed nga ako...</td>\n",
438
+ " </tr>\n",
439
+ " <tr>\n",
440
+ " <th>...</th>\n",
441
+ " <td>...</td>\n",
442
+ " <td>...</td>\n",
443
+ " <td>...</td>\n",
444
+ " </tr>\n",
445
+ " <tr>\n",
446
+ " <th>34787</th>\n",
447
+ " <td>surprise</td>\n",
448
+ " <td>@MichelGW have you gift! Hope you like it! It'...</td>\n",
449
+ " <td>gift! Hope like it! hand wear ! It'll warm! Lol</td>\n",
450
+ " </tr>\n",
451
+ " <tr>\n",
452
+ " <th>34788</th>\n",
453
+ " <td>joy</td>\n",
454
+ " <td>The world didnt give it to me..so the world MO...</td>\n",
455
+ " <td>world didnt me..so world DEFINITELY cnt away!!!</td>\n",
456
+ " </tr>\n",
457
+ " <tr>\n",
458
+ " <th>34789</th>\n",
459
+ " <td>anger</td>\n",
460
+ " <td>A man robbed me today .</td>\n",
461
+ " <td>man robbed today .</td>\n",
462
+ " </tr>\n",
463
+ " <tr>\n",
464
+ " <th>34790</th>\n",
465
+ " <td>fear</td>\n",
466
+ " <td>Youu call it JEALOUSY, I call it of #Losing YO...</td>\n",
467
+ " <td>Youu JEALOUSY, #Losing YOU...</td>\n",
468
+ " </tr>\n",
469
+ " <tr>\n",
470
+ " <th>34791</th>\n",
471
+ " <td>sadness</td>\n",
472
+ " <td>I think about you baby, and I dream about you ...</td>\n",
473
+ " <td>think baby, dream time</td>\n",
474
+ " </tr>\n",
475
+ " </tbody>\n",
476
+ "</table>\n",
477
+ "<p>34792 rows × 3 columns</p>\n",
478
+ "</div>"
479
+ ],
480
+ "text/plain": [
481
+ " Emotion Text \\\n",
482
+ "0 neutral Why ? \n",
483
+ "1 joy Sage Act upgrade on my to do list for tommorow. \n",
484
+ "2 sadness ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ... \n",
485
+ "3 joy Such an eye ! The true hazel eye-and so brill... \n",
486
+ "4 joy @Iluvmiasantos ugh babe.. hugggzzz for u .! b... \n",
487
+ "... ... ... \n",
488
+ "34787 surprise @MichelGW have you gift! Hope you like it! It'... \n",
489
+ "34788 joy The world didnt give it to me..so the world MO... \n",
490
+ "34789 anger A man robbed me today . \n",
491
+ "34790 fear Youu call it JEALOUSY, I call it of #Losing YO... \n",
492
+ "34791 sadness I think about you baby, and I dream about you ... \n",
493
+ "\n",
494
+ " Clean_Text \n",
495
+ "0 ? \n",
496
+ "1 Sage Act upgrade list tommorow. \n",
497
+ "2 WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS... \n",
498
+ "3 eye ! true hazel eye-and brilliant ! Regular f... \n",
499
+ "4 ugh babe.. hugggzzz u .! babe naamazed nga ako... \n",
500
+ "... ... \n",
501
+ "34787 gift! Hope like it! hand wear ! It'll warm! Lol \n",
502
+ "34788 world didnt me..so world DEFINITELY cnt away!!! \n",
503
+ "34789 man robbed today . \n",
504
+ "34790 Youu JEALOUSY, #Losing YOU... \n",
505
+ "34791 think baby, dream time \n",
506
+ "\n",
507
+ "[34792 rows x 3 columns]"
508
+ ]
509
+ },
510
+ "execution_count": 9,
511
+ "metadata": {},
512
+ "output_type": "execute_result"
513
+ }
514
+ ],
515
+ "source": [
516
+ "df"
517
+ ]
518
+ },
519
+ {
520
+ "cell_type": "code",
521
+ "execution_count": 10,
522
+ "id": "450c39c0-79dd-4eaf-85fe-57e344eb81bd",
523
+ "metadata": {},
524
+ "outputs": [],
525
+ "source": [
526
+ "# Features & Labels\n",
527
+ "Xfeatures = df['Clean_Text']\n",
528
+ "ylabels = df['Emotion']"
529
+ ]
530
+ },
531
+ {
532
+ "cell_type": "markdown",
533
+ "id": "edde3d4b",
534
+ "metadata": {},
535
+ "source": [
536
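+ "# It is advisable to split the data into training and test sets before fitting the pipeline, so that nothing learned from the test set (such as the vectorizer's vocabulary) leaks into training."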
537
+ ]
538
+ },
539
+ {
540
+ "cell_type": "code",
541
+ "execution_count": 11,
542
+ "id": "27d7f976-c28f-449e-ae1a-53a42bbda4e8",
543
+ "metadata": {},
544
+ "outputs": [],
545
+ "source": [
546
+ "# Split Data\n",
547
+ "x_train,x_test,y_train,y_test = train_test_split(Xfeatures,ylabels,test_size=0.3,random_state=42)"
548
+ ]
549
+ },
550
+ {
551
+ "cell_type": "code",
552
+ "execution_count": 12,
553
+ "id": "2f086f29-dba9-40d2-a9dd-f06a6cca3a4c",
554
+ "metadata": {},
555
+ "outputs": [],
556
+ "source": [
557
+ "# Build Pipeline\n",
558
+ "from sklearn.pipeline import Pipeline"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": 13,
564
+ "id": "6b81cc86-2bef-40c2-b9a3-668caaadaff0",
565
+ "metadata": {},
566
+ "outputs": [],
567
+ "source": [
568
+ "# LogisticRegression Pipeline\n",
569
+ "pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression())])"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": 14,
575
+ "id": "dc64b9a7-efe2-4bc4-a0e7-46dff1d52b31",
576
+ "metadata": {
577
+ "jupyter": {
578
+ "outputs_hidden": true
579
+ },
580
+ "scrolled": false,
581
+ "tags": []
582
+ },
583
+ "outputs": [
584
+ {
585
+ "name": "stderr",
586
+ "output_type": "stream",
587
+ "text": [
588
+ "C:\\Users\\Sanket\\anaconda3\\envs\\nlp\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
589
+ "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
590
+ "\n",
591
+ "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
592
+ " https://scikit-learn.org/stable/modules/preprocessing.html\n",
593
+ "Please also refer to the documentation for alternative solver options:\n",
594
+ " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
595
+ " n_iter_i = _check_optimize_result(\n"
596
+ ]
597
+ },
598
+ {
599
+ "data": {
600
+ "text/html": [
601
+ "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;cv&#x27;, CountVectorizer()), (&#x27;lr&#x27;, LogisticRegression())])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">Pipeline</label><div class=\"sk-toggleable__content\"><pre>Pipeline(steps=[(&#x27;cv&#x27;, CountVectorizer()), (&#x27;lr&#x27;, LogisticRegression())])</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">CountVectorizer</label><div class=\"sk-toggleable__content\"><pre>CountVectorizer()</pre></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression()</pre></div></div></div></div></div></div></div>"
602
+ ],
603
+ "text/plain": [
604
+ "Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])"
605
+ ]
606
+ },
607
+ "execution_count": 14,
608
+ "metadata": {},
609
+ "output_type": "execute_result"
610
+ }
611
+ ],
612
+ "source": [
613
+ "# Train and Fit Data\n",
614
+ "pipe_lr.fit(x_train,y_train)"
615
+ ]
616
+ },
617
+ {
618
+ "cell_type": "code",
619
+ "execution_count": 15,
620
+ "id": "135ed6f8-56ff-4d53-85e3-541e3a7ae2d7",
621
+ "metadata": {},
622
+ "outputs": [
623
+ {
624
+ "data": {
625
+ "text/html": [
626
+ "<style>#sk-container-id-2 {color: black;background-color: white;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;cv&#x27;, CountVectorizer()), (&#x27;lr&#x27;, LogisticRegression())])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">Pipeline</label><div class=\"sk-toggleable__content\"><pre>Pipeline(steps=[(&#x27;cv&#x27;, CountVectorizer()), (&#x27;lr&#x27;, LogisticRegression())])</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">CountVectorizer</label><div class=\"sk-toggleable__content\"><pre>CountVectorizer()</pre></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression()</pre></div></div></div></div></div></div></div>"
627
+ ],
628
+ "text/plain": [
629
+ "Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])"
630
+ ]
631
+ },
632
+ "execution_count": 15,
633
+ "metadata": {},
634
+ "output_type": "execute_result"
635
+ }
636
+ ],
637
+ "source": [
638
+ "pipe_lr"
639
+ ]
640
+ },
641
+ {
642
+ "cell_type": "code",
643
+ "execution_count": 16,
644
+ "id": "28396371-5f5c-4a3b-b974-164e047764f3",
645
+ "metadata": {},
646
+ "outputs": [
647
+ {
648
+ "data": {
649
+ "text/plain": [
650
+ "0.619946349875455"
651
+ ]
652
+ },
653
+ "execution_count": 16,
654
+ "metadata": {},
655
+ "output_type": "execute_result"
656
+ }
657
+ ],
658
+ "source": [
659
+ "# Check Accuracy\n",
660
+ "pipe_lr.score(x_test,y_test)"
661
+ ]
662
+ },
663
+ {
664
+ "cell_type": "code",
665
+ "execution_count": 17,
666
+ "id": "eb3a26b6-d09e-422f-991b-b08c48f55b75",
667
+ "metadata": {},
668
+ "outputs": [],
669
+ "source": [
670
+ "# Make A Prediction\n",
671
+ "ex1 = \"This book was so interesting it made me happy\""
672
+ ]
673
+ },
674
+ {
675
+ "cell_type": "code",
676
+ "execution_count": 18,
677
+ "id": "b08597d9-6f59-45cb-a648-95b0da1ce313",
678
+ "metadata": {},
679
+ "outputs": [
680
+ {
681
+ "data": {
682
+ "text/plain": [
683
+ "array(['joy'], dtype=object)"
684
+ ]
685
+ },
686
+ "execution_count": 18,
687
+ "metadata": {},
688
+ "output_type": "execute_result"
689
+ }
690
+ ],
691
+ "source": [
692
+ "pipe_lr.predict([ex1])"
693
+ ]
694
+ },
695
+ {
696
+ "cell_type": "code",
697
+ "execution_count": 19,
698
+ "id": "5b3822ac-17fc-43dd-9bb7-8dad07a4d32c",
699
+ "metadata": {},
700
+ "outputs": [
701
+ {
702
+ "data": {
703
+ "text/plain": [
704
+ "array([[1.60505334e-03, 7.06448086e-03, 6.95652453e-03, 9.43810868e-01,\n",
705
+ " 1.00440585e-04, 2.63232385e-02, 6.63277122e-05, 1.40730665e-02]])"
706
+ ]
707
+ },
708
+ "execution_count": 19,
709
+ "metadata": {},
710
+ "output_type": "execute_result"
711
+ }
712
+ ],
713
+ "source": [
714
+ "# Prediction Prob\n",
715
+ "pipe_lr.predict_proba([ex1])"
716
+ ]
717
+ },
718
+ {
719
+ "cell_type": "code",
720
+ "execution_count": 20,
721
+ "id": "5b7c4596-d643-48e5-a777-79a6f55c49da",
722
+ "metadata": {},
723
+ "outputs": [
724
+ {
725
+ "data": {
726
+ "text/plain": [
727
+ "array(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'shame',\n",
728
+ " 'surprise'], dtype=object)"
729
+ ]
730
+ },
731
+ "execution_count": 20,
732
+ "metadata": {},
733
+ "output_type": "execute_result"
734
+ }
735
+ ],
736
+ "source": [
737
+ "# To Know the classes\n",
738
+ "pipe_lr.classes_"
739
+ ]
740
+ },
741
+ {
742
+ "cell_type": "code",
743
+ "execution_count": 21,
744
+ "id": "c0d40f62-b1fd-4748-a279-c8f50c748f26",
745
+ "metadata": {},
746
+ "outputs": [],
747
+ "source": [
748
+ "# Save Model & Pipeline\n",
749
+ "import joblib\n",
750
+ "pipeline_file = open(\"../models/emotion_classifier_pipe_lr.pkl\",\"wb\")\n",
751
+ "joblib.dump(pipe_lr,pipeline_file)\n",
752
+ "pipeline_file.close()"
753
+ ]
754
+ },
755
+ {
756
+ "cell_type": "code",
757
+ "execution_count": null,
758
+ "id": "377c4e98-67f0-45e5-8dd5-0417585754f0",
759
+ "metadata": {},
760
+ "outputs": [],
761
+ "source": []
762
+ }
763
+ ],
764
+ "metadata": {
765
+ "kernelspec": {
766
+ "display_name": "Python 3 (ipykernel)",
767
+ "language": "python",
768
+ "name": "python3"
769
+ },
770
+ "language_info": {
771
+ "codemirror_mode": {
772
+ "name": "ipython",
773
+ "version": 3
774
+ },
775
+ "file_extension": ".py",
776
+ "mimetype": "text/x-python",
777
+ "name": "python",
778
+ "nbconvert_exporter": "python",
779
+ "pygments_lexer": "ipython3",
780
+ "version": "3.10.9"
781
+ }
782
+ },
783
+ "nbformat": 4,
784
+ "nbformat_minor": 5
785
+ }