junming-qiu committed on
Commit
4902206
1 Parent(s): b71d527

fine tuned model added

Files changed (4)
  1. Final_Project_Fine_Tuning.ipynb +685 -0
  2. app.py +21 -8
  3. init.bash +1 -1
  4. requirements.txt +2 -1
Final_Project_Fine_Tuning.ipynb ADDED
@@ -0,0 +1,685 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "colab": {
8
+ "base_uri": "https://localhost:8080/"
9
+ },
10
+ "id": "K6KNj8R5pFOi",
11
+ "outputId": "73e388e8-294f-438d-ddc2-06ae7132580a"
12
+ },
13
+ "outputs": [],
14
+ "source": [
15
+ "!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge\n",
16
+ "!unzip jigsaw-toxic-comment-classification-challenge"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {
23
+ "colab": {
24
+ "base_uri": "https://localhost:8080/"
25
+ },
26
+ "id": "-a6Sx13TqW2h",
27
+ "outputId": "eb6bb305-7b66-4f59-e1e3-24858c1309c4"
28
+ },
29
+ "outputs": [],
30
+ "source": [
31
+ "!unzip test.csv.zip \n",
32
+ "!unzip test_labels.csv.zip \n",
33
+ "!unzip train.csv.zip"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 1,
39
+ "metadata": {
40
+ "id": "Jt-aOqhVqavv"
41
+ },
42
+ "outputs": [],
43
+ "source": [
44
+ "import warnings\n",
45
+ "import pandas as pd\n",
46
+ "import torch\n",
47
+ "import numpy as np\n",
48
+ "from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler\n",
49
+ "from sklearn.model_selection import train_test_split\n",
50
+ "from transformers import Trainer, TrainingArguments\n",
51
+ "from transformers import AutoTokenizer, AutoModelForSequenceClassification"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 2,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "warnings.filterwarnings('ignore')"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": 3,
66
+ "metadata": {},
67
+ "outputs": [
68
+ {
69
+ "name": "stdout",
70
+ "output_type": "stream",
71
+ "text": [
72
+ "mps:0\n"
73
+ ]
74
+ }
75
+ ],
76
+ "source": [
77
+ "# Use GPU\n",
78
+ "#device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
79
+ "device = \"mps:0\" if torch.backends.mps.is_available() else \"cpu\"\n",
80
+ "print(device)"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 4,
86
+ "metadata": {
87
+ "id": "zMDF7x0H4VFW"
88
+ },
89
+ "outputs": [
90
+ {
91
+ "data": {
92
+ "text/html": [
93
+ "<div>\n",
94
+ "<style scoped>\n",
95
+ " .dataframe tbody tr th:only-of-type {\n",
96
+ " vertical-align: middle;\n",
97
+ " }\n",
98
+ "\n",
99
+ " .dataframe tbody tr th {\n",
100
+ " vertical-align: top;\n",
101
+ " }\n",
102
+ "\n",
103
+ " .dataframe thead th {\n",
104
+ " text-align: right;\n",
105
+ " }\n",
106
+ "</style>\n",
107
+ "<table border=\"1\" class=\"dataframe\">\n",
108
+ " <thead>\n",
109
+ " <tr style=\"text-align: right;\">\n",
110
+ " <th></th>\n",
111
+ " <th>id</th>\n",
112
+ " <th>comment_text</th>\n",
113
+ " <th>toxic</th>\n",
114
+ " <th>severe_toxic</th>\n",
115
+ " <th>obscene</th>\n",
116
+ " <th>threat</th>\n",
117
+ " <th>insult</th>\n",
118
+ " <th>identity_hate</th>\n",
119
+ " </tr>\n",
120
+ " </thead>\n",
121
+ " <tbody>\n",
122
+ " <tr>\n",
123
+ " <th>0</th>\n",
124
+ " <td>0000997932d777bf</td>\n",
125
+ " <td>Explanation\\nWhy the edits made under my usern...</td>\n",
126
+ " <td>0</td>\n",
127
+ " <td>0</td>\n",
128
+ " <td>0</td>\n",
129
+ " <td>0</td>\n",
130
+ " <td>0</td>\n",
131
+ " <td>0</td>\n",
132
+ " </tr>\n",
133
+ " </tbody>\n",
134
+ "</table>\n",
135
+ "</div>"
136
+ ],
137
+ "text/plain": [
138
+ " id comment_text toxic \\\n",
139
+ "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \n",
140
+ "\n",
141
+ " severe_toxic obscene threat insult identity_hate \n",
142
+ "0 0 0 0 0 0 "
143
+ ]
144
+ },
145
+ "execution_count": 4,
146
+ "metadata": {},
147
+ "output_type": "execute_result"
148
+ }
149
+ ],
150
+ "source": [
151
+ "# Load training text and label dataset\n",
152
+ "# Preprocess data\n",
153
+ "\n",
154
+ "#test_texts = pd.read_csv(\"test.csv\").values.tolist()\n",
155
+ "#test_labels = pd.read_csv('test_labels.csv').values.tolist()\n",
156
+ "\n",
157
+ "train = pd.read_csv('train.csv')\n",
158
+ "train.head(1)"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 5,
164
+ "metadata": {},
165
+ "outputs": [
166
+ {
167
+ "name": "stdout",
168
+ "output_type": "stream",
169
+ "text": [
170
+ "159571 (159571, 8)\n",
171
+ "id False\n",
172
+ "comment_text False\n",
173
+ "toxic False\n",
174
+ "severe_toxic False\n",
175
+ "obscene False\n",
176
+ "threat False\n",
177
+ "insult False\n",
178
+ "identity_hate False\n",
179
+ "dtype: bool\n",
180
+ "False\n"
181
+ ]
182
+ }
183
+ ],
184
+ "source": [
185
+ "# Any duplicates?\n",
186
+ "print(len(train['comment_text'].unique()), train.shape)\n",
187
+ "\n",
188
+ "# Any missing values?\n",
189
+ "print(train.isnull().any())\n",
190
+ "print(train.isnull().values.any())"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": 6,
196
+ "metadata": {},
197
+ "outputs": [
198
+ {
199
+ "data": {
200
+ "text/html": [
201
+ "<div>\n",
202
+ "<style scoped>\n",
203
+ " .dataframe tbody tr th:only-of-type {\n",
204
+ " vertical-align: middle;\n",
205
+ " }\n",
206
+ "\n",
207
+ " .dataframe tbody tr th {\n",
208
+ " vertical-align: top;\n",
209
+ " }\n",
210
+ "\n",
211
+ " .dataframe thead th {\n",
212
+ " text-align: right;\n",
213
+ " }\n",
214
+ "</style>\n",
215
+ "<table border=\"1\" class=\"dataframe\">\n",
216
+ " <thead>\n",
217
+ " <tr style=\"text-align: right;\">\n",
218
+ " <th></th>\n",
219
+ " <th>id</th>\n",
220
+ " <th>comment_text</th>\n",
221
+ " <th>toxic</th>\n",
222
+ " <th>severe_toxic</th>\n",
223
+ " <th>obscene</th>\n",
224
+ " <th>threat</th>\n",
225
+ " <th>insult</th>\n",
226
+ " <th>identity_hate</th>\n",
227
+ " <th>grouped_labels</th>\n",
228
+ " </tr>\n",
229
+ " </thead>\n",
230
+ " <tbody>\n",
231
+ " <tr>\n",
232
+ " <th>0</th>\n",
233
+ " <td>0000997932d777bf</td>\n",
234
+ " <td>Explanation\\nWhy the edits made under my usern...</td>\n",
235
+ " <td>0</td>\n",
236
+ " <td>0</td>\n",
237
+ " <td>0</td>\n",
238
+ " <td>0</td>\n",
239
+ " <td>0</td>\n",
240
+ " <td>0</td>\n",
241
+ " <td>[0, 0, 0, 0, 0, 0]</td>\n",
242
+ " </tr>\n",
243
+ " </tbody>\n",
244
+ "</table>\n",
245
+ "</div>"
246
+ ],
247
+ "text/plain": [
248
+ " id comment_text toxic \\\n",
249
+ "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \n",
250
+ "\n",
251
+ " severe_toxic obscene threat insult identity_hate grouped_labels \n",
252
+ "0 0 0 0 0 0 [0, 0, 0, 0, 0, 0] "
253
+ ]
254
+ },
255
+ "execution_count": 6,
256
+ "metadata": {},
257
+ "output_type": "execute_result"
258
+ }
259
+ ],
260
+ "source": [
261
+ "# Group labels to get right format for training\n",
262
+ "labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n",
263
+ "train['grouped_labels'] = train[labels].values.tolist()\n",
264
+ "train.head(1)"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": 7,
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": [
273
+ "# Convert to list from dataframe\n",
274
+ "train_texts = train['comment_text'].values.tolist()\n",
275
+ "train_labels = train['grouped_labels'].values.tolist()"
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "code",
280
+ "execution_count": 8,
281
+ "metadata": {
282
+ "id": "vkxJ6NkFlc46",
283
+ "tags": []
284
+ },
285
+ "outputs": [],
286
+ "source": [
287
+ "# Use distilbert, a faster model of BERT which keeps 95% of the performance\n",
288
+ "model_name = \"bert-base-uncased\"\n",
289
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)"
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "code",
294
+ "execution_count": 9,
295
+ "metadata": {},
296
+ "outputs": [
297
+ {
298
+ "name": "stdout",
299
+ "output_type": "stream",
300
+ "text": [
301
+ "[1, 0, 1, 1, 0, 0] 11\n",
302
+ "[1, 1, 0, 1, 0, 0] 11\n",
303
+ "[1, 0, 0, 1, 0, 1] 7\n",
304
+ "[1, 1, 0, 0, 1, 1] 7\n",
305
+ "[1, 1, 1, 0, 0, 1] 6\n",
306
+ "[1, 1, 1, 1, 0, 0] 4\n",
307
+ "[0, 0, 0, 1, 1, 0] 3\n",
308
+ "[1, 0, 0, 1, 1, 1] 3\n",
309
+ "[1, 1, 0, 0, 0, 1] 3\n",
310
+ "[0, 0, 1, 0, 0, 1] 3\n",
311
+ "[0, 0, 1, 1, 0, 0] 2\n",
312
+ "[0, 0, 1, 1, 1, 0] 2\n",
313
+ "[1, 1, 0, 1, 1, 0] 1\n",
314
+ "[1, 1, 0, 1, 0, 1] 1\n",
315
+ "Name: grouped_labels, dtype: int64\n",
316
+ "df label indices with only one instance: [159029, 158498, 157010, 154553, 149180, 144159, 139501, 138026, 134459, 133505, 127410, 120395, 115766, 113304, 110056, 107881, 107096, 101089, 98699, 86746, 76454, 74607, 68264, 66350, 63687, 61934, 57594, 53408, 45101, 41461, 36141, 31191, 30566, 29445, 23374, 17187, 15977, 9487, 8979, 6316, 6063, 2374]\n"
317
+ ]
318
+ }
319
+ ],
320
+ "source": [
321
+ "# Also do preprocessing to see if there are any unique rows\n",
322
+ "# with that specfic combination of labels\n",
323
+ "# If that is the case, we want to include that row in the training data\n",
324
+ "\n",
325
+ "# Find unique label combinations\n",
326
+ "label_counts = train['grouped_labels'].astype(str).value_counts()\n",
327
+ "print(label_counts[-14:])\n",
328
+ "\n",
329
+ "# Take low frequency labels\n",
330
+ "low_freq = label_counts[label_counts<10].keys()\n",
331
+ "low_freq_inds = sorted(list(train[train['grouped_labels'].astype(str).isin(low_freq)].index), reverse=True)\n",
332
+ "print('df label indices with only one instance: ', low_freq_inds)"
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "execution_count": 10,
338
+ "metadata": {},
339
+ "outputs": [],
340
+ "source": [
341
+ "low_freq_train_texts = [train_texts.pop(i) for i in low_freq_inds]\n",
342
+ "low_freq_train_labels = [train_labels.pop(i) for i in low_freq_inds]"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 11,
348
+ "metadata": {},
349
+ "outputs": [],
350
+ "source": [
351
+ "# Add low freq values to training data\n",
352
+ "train_texts.extend(low_freq_train_texts)\n",
353
+ "train_labels.extend(low_freq_train_labels)"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": 12,
359
+ "metadata": {},
360
+ "outputs": [],
361
+ "source": [
362
+ "# Split datasets for training\n",
363
+ "train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.1)"
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": 13,
369
+ "metadata": {},
370
+ "outputs": [],
371
+ "source": [
372
+ "# Shorten token to increase training speed, average is below this\n",
373
+ "max_length = 100\n",
374
+ "train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors=\"pt\", max_length=max_length).to(device)\n",
375
+ "val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors=\"pt\", max_length=max_length).to(device)"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": 14,
381
+ "metadata": {},
382
+ "outputs": [],
383
+ "source": [
384
+ "class ToxicDataset(Dataset):\n",
385
+ " def __init__(self, encodings, labels):\n",
386
+ " self.encodings = encodings\n",
387
+ " self.labels = [[float(y) for y in x] for x in labels]\n",
388
+ "\n",
389
+ " def __getitem__(self, idx):\n",
390
+ " item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
391
+ " item['labels'] = torch.tensor(self.labels[idx])\n",
392
+ " return item\n",
393
+ "\n",
394
+ " def __len__(self):\n",
395
+ " return len(self.labels)"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "code",
400
+ "execution_count": 15,
401
+ "metadata": {},
402
+ "outputs": [],
403
+ "source": [
404
+ "train_dataset = ToxicDataset(train_encodings, train_labels)\n",
405
+ "val_dataset = ToxicDataset(val_encodings, val_labels)"
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": 16,
411
+ "metadata": {
412
+ "tags": []
413
+ },
414
+ "outputs": [
415
+ {
416
+ "name": "stderr",
417
+ "output_type": "stream",
418
+ "text": [
419
+ "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']\n",
420
+ "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
421
+ "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
422
+ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
423
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
424
+ ]
425
+ }
426
+ ],
427
+ "source": [
428
+ "model = AutoModelForSequenceClassification.from_pretrained(model_name,\n",
429
+ " num_labels=6,\n",
430
+ " ).to(device)"
431
+ ]
432
+ },
433
+ {
434
+ "cell_type": "code",
435
+ "execution_count": 56,
436
+ "metadata": {
437
+ "collapsed": true,
438
+ "id": "CI2B0V5D27gA",
439
+ "jupyter": {
440
+ "outputs_hidden": true
441
+ },
442
+ "tags": []
443
+ },
444
+ "outputs": [
445
+ {
446
+ "name": "stderr",
447
+ "output_type": "stream",
448
+ "text": [
449
+ "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n",
450
+ "PyTorch: setting up devices\n",
451
+ "***** Running training *****\n",
452
+ " Num examples = 127656\n",
453
+ " Num Epochs = 1\n",
454
+ " Instantaneous batch size per device = 16\n",
455
+ " Total train batch size (w. parallel, distributed & accumulation) = 16\n",
456
+ " Gradient Accumulation steps = 1\n",
457
+ " Total optimization steps = 7979\n",
458
+ " Number of trainable parameters = 109486854\n"
459
+ ]
460
+ },
461
+ {
462
+ "data": {
463
+ "text/html": [
464
+ "\n",
465
+ " <div>\n",
466
+ " \n",
467
+ " <progress value='33' max='7979' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
468
+ " [ 33/7979 00:21 < 1:33:06, 1.42 it/s, Epoch 0.00/1]\n",
469
+ " </div>\n",
470
+ " <table border=\"1\" class=\"dataframe\">\n",
471
+ " <thead>\n",
472
+ " <tr style=\"text-align: left;\">\n",
473
+ " <th>Step</th>\n",
474
+ " <th>Training Loss</th>\n",
475
+ " </tr>\n",
476
+ " </thead>\n",
477
+ " <tbody>\n",
478
+ " <tr>\n",
479
+ " <td>10</td>\n",
480
+ " <td>0.605800</td>\n",
481
+ " </tr>\n",
482
+ " <tr>\n",
483
+ " <td>20</td>\n",
484
+ " <td>0.590100</td>\n",
485
+ " </tr>\n",
486
+ " <tr>\n",
487
+ " <td>30</td>\n",
488
+ " <td>0.550200</td>\n",
489
+ " </tr>\n",
490
+ " </tbody>\n",
491
+ "</table><p>"
492
+ ],
493
+ "text/plain": [
494
+ "<IPython.core.display.HTML object>"
495
+ ]
496
+ },
497
+ "metadata": {},
498
+ "output_type": "display_data"
499
+ },
500
+ {
501
+ "ename": "KeyboardInterrupt",
502
+ "evalue": "",
503
+ "output_type": "error",
504
+ "traceback": [
505
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
506
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
507
+ "Cell \u001b[0;32mIn[56], line 28\u001b[0m\n\u001b[1;32m 9\u001b[0m training_args \u001b[38;5;241m=\u001b[39m TrainingArgumentsWithMPSSupport(\n\u001b[1;32m 10\u001b[0m output_dir \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m./results\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 11\u001b[0m num_train_epochs\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 18\u001b[0m logging_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m,\n\u001b[1;32m 19\u001b[0m )\n\u001b[1;32m 21\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Trainer(\n\u001b[1;32m 22\u001b[0m model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m 23\u001b[0m args\u001b[38;5;241m=\u001b[39mtraining_args,\n\u001b[1;32m 24\u001b[0m train_dataset\u001b[38;5;241m=\u001b[39mtrain_dataset,\n\u001b[1;32m 25\u001b[0m eval_dataset\u001b[38;5;241m=\u001b[39mval_dataset,\n\u001b[1;32m 26\u001b[0m )\n\u001b[0;32m---> 28\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
508
+ "File \u001b[0;32m~/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:1501\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_wrapped \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\n\u001b[1;32m 1498\u001b[0m inner_training_loop \u001b[38;5;241m=\u001b[39m find_executable_batch_size(\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inner_training_loop, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_batch_size, args\u001b[38;5;241m.\u001b[39mauto_find_batch_size\n\u001b[1;32m 1500\u001b[0m )\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1503\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1504\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1505\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1506\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
509
+ "File \u001b[0;32m~/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:1749\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1747\u001b[0m tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining_step(model, inputs)\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1749\u001b[0m tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1751\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 1752\u001b[0m args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 1753\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[1;32m 1754\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 1755\u001b[0m ):\n\u001b[1;32m 1756\u001b[0m \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 1757\u001b[0m tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
510
+ "File \u001b[0;32m~/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:2526\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 2524\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdeepspeed\u001b[38;5;241m.\u001b[39mbackward(loss)\n\u001b[1;32m 2525\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2526\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2528\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\u001b[38;5;241m.\u001b[39mdetach()\n",
511
+ "File \u001b[0;32m~/anaconda3/lib/python3.10/site-packages/torch/_tensor.py:488\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 478\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 479\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 480\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 481\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 486\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 487\u001b[0m )\n\u001b[0;32m--> 488\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 489\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
512
+ "File \u001b[0;32m~/anaconda3/lib/python3.10/site-packages/torch/autograd/__init__.py:204\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 199\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m 201\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m 202\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 203\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 204\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m 205\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 206\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
513
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
514
+ ]
515
+ }
516
+ ],
517
+ "source": [
518
+ "class TrainingArgumentsWithMPSSupport(TrainingArguments):\n",
519
+ " @property\n",
520
+ " def device(self) -> torch.device:\n",
521
+ " if torch.backends.mps.is_available():\n",
522
+ " return torch.device(\"mps\")\n",
523
+ " else:\n",
524
+ " return torch.device(\"cpu\")\n",
525
+ "\n",
526
+ "training_args = TrainingArgumentsWithMPSSupport(\n",
527
+ " output_dir = './results',\n",
528
+ " num_train_epochs=1,\n",
529
+ " per_device_train_batch_size=16,\n",
530
+ " per_device_eval_batch_size=16,\n",
531
+ " warmup_steps=500,\n",
532
+ " learning_rate=5e-5,\n",
533
+ " weight_decay=0.01,\n",
534
+ " logging_dir='./logs',\n",
535
+ " logging_steps=10,\n",
536
+ ")\n",
537
+ "\n",
538
+ "trainer = Trainer(\n",
539
+ " model=model,\n",
540
+ " args=training_args,\n",
541
+ " train_dataset=train_dataset,\n",
542
+ " eval_dataset=val_dataset,\n",
543
+ ")\n",
544
+ "\n",
545
+ "trainer.train()"
546
+ ]
547
+ },
548
+ {
549
+ "cell_type": "code",
550
+ "execution_count": 21,
551
+ "metadata": {},
552
+ "outputs": [
553
+ {
554
+ "name": "stderr",
555
+ "output_type": "stream",
556
+ "text": [
557
+ "Saving model checkpoint to ./model_checkpoint/done\n",
558
+ "Configuration saved in ./model_checkpoint/done/config.json\n",
559
+ "Model weights saved in ./model_checkpoint/done/pytorch_model.bin\n"
560
+ ]
561
+ }
562
+ ],
563
+ "source": [
564
+ "trainer.save_model('./model_checkpoint/done')"
565
+ ]
566
+ },
567
+ {
568
+ "cell_type": "code",
569
+ "execution_count": 18,
570
+ "metadata": {
571
+ "tags": []
572
+ },
573
+ "outputs": [],
574
+ "source": [
575
+ "from transformers import BertTokenizer, BertForSequenceClassification\n",
576
+ "#saved = DistilBertModel.from_pretrained('./model_checkpoint/trained', num_labels=6, problem_type=\"multi_label_classification\")\n",
577
+ "saved = BertForSequenceClassification.from_pretrained('./model_checkpoint/fine_tuned')"
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "execution_count": 19,
583
+ "metadata": {},
584
+ "outputs": [
585
+ {
586
+ "ename": "NameError",
587
+ "evalue": "name 'trainer' is not defined",
588
+ "output_type": "error",
589
+ "traceback": [
590
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
591
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
592
+ "Cell \u001b[0;32mIn[19], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241m.\u001b[39mevaluate()\n",
593
+ "\u001b[0;31mNameError\u001b[0m: name 'trainer' is not defined"
594
+ ]
595
+ }
596
+ ],
597
+ "source": [
598
+ "trainer.evaluate()"
599
+ ]
600
+ },
601
+ {
602
+ "cell_type": "code",
603
+ "execution_count": 59,
604
+ "metadata": {},
605
+ "outputs": [
606
+ {
607
+ "data": {
608
+ "text/plain": [
609
+ "[[0.4601849317550659,\n",
610
+ " 0.0626736581325531,\n",
611
+ " 0.1962047964334488,\n",
612
+ " 0.0715285912156105,\n",
613
+ " 0.1363525241613388,\n",
614
+ " 0.0730554461479187]]"
615
+ ]
616
+ },
617
+ "execution_count": 59,
618
+ "metadata": {},
619
+ "output_type": "execute_result"
620
+ }
621
+ ],
622
+ "source": [
623
+ "text = \"fun\"\n",
624
+ "encoded_input = tokenizer(text, return_tensors=\"pt\")\n",
625
+ "outputs = saved(**encoded_input)\n",
626
+ "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
627
+ "predictions = predictions.cpu().detach().numpy()\n",
628
+ "predictions.tolist()"
629
+ ]
630
+ },
631
+ {
632
+ "cell_type": "code",
633
+ "execution_count": 48,
634
+ "metadata": {},
635
+ "outputs": [],
636
+ "source": [
637
+ "res = [1 if x >= 0.5 else 0 for x in predictions[0]]"
638
+ ]
639
+ },
640
+ {
641
+ "cell_type": "code",
642
+ "execution_count": 49,
643
+ "metadata": {},
644
+ "outputs": [
645
+ {
646
+ "data": {
647
+ "text/plain": [
648
+ "[1, 0, 0, 0, 0, 0]"
649
+ ]
650
+ },
651
+ "execution_count": 49,
652
+ "metadata": {},
653
+ "output_type": "execute_result"
654
+ }
655
+ ],
656
+ "source": [
657
+ "res"
658
+ ]
659
+ }
660
+ ],
661
+ "metadata": {
662
+ "colab": {
663
+ "provenance": []
664
+ },
665
+ "kernelspec": {
666
+ "display_name": "Python 3 (ipykernel)",
667
+ "language": "python",
668
+ "name": "python3"
669
+ },
670
+ "language_info": {
671
+ "codemirror_mode": {
672
+ "name": "ipython",
673
+ "version": 3
674
+ },
675
+ "file_extension": ".py",
676
+ "mimetype": "text/x-python",
677
+ "name": "python",
678
+ "nbconvert_exporter": "python",
679
+ "pygments_lexer": "ipython3",
680
+ "version": "3.10.9"
681
+ }
682
+ },
683
+ "nbformat": 4,
684
+ "nbformat_minor": 4
685
+ }
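Note on the notebook's final inference cells: the model is trained as a multi-label classifier (six independent toxicity labels), so per-label probabilities are usually taken from a sigmoid over the logits rather than the softmax used above. The following is only a minimal sketch of that alternative, assuming the ./model_checkpoint/fine_tuned directory produced by the notebook is available locally; it is not part of this commit.

# Sketch only: sigmoid-based multi-label inference on the fine-tuned checkpoint
import torch
from transformers import AutoTokenizer, BertForSequenceClassification

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained('./model_checkpoint/fine_tuned')

encoded = tokenizer("fun", return_tensors="pt")
with torch.no_grad():
    logits = model(**encoded).logits          # shape (1, 6)
probs = torch.sigmoid(logits)[0]              # independent probability per label
flags = {name: int(p >= 0.5) for name, p in zip(labels, probs.tolist())}
print(flags)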
app.py CHANGED
@@ -1,9 +1,11 @@
1
  import streamlit as st
2
  from transformers import pipeline
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
 
5
-
6
- models = ["cardiffnlp/twitter-xlm-roberta-base-sentiment", "nlptown/bert-base-multilingual-uncased-sentiment", "Tatyana/rubert-base-cased-sentiment-new"]
7
 
8
 
9
 
@@ -15,9 +17,20 @@ with st.form("form"):
15
 
16
  if submitted:
17
  model_name = models[models.index(selection)]
18
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
19
- tokenizer = AutoTokenizer.from_pretrained(model_name)
20
- classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
21
- result = classifier(text)
22
- st.write("Label:", result[0]["label"])
23
- st.write('Score: ', result[0]['score'])
1
  import streamlit as st
2
  from transformers import pipeline
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
+ from transformers import BertTokenizer, BertForSequenceClassification
5
+ from huggingface_hub.inference_api import InferenceApi
6
+ import os
7
 
8
+ models = ["cardiffnlp/twitter-xlm-roberta-base-sentiment", "nlptown/bert-base-multilingual-uncased-sentiment", "Tatyana/rubert-base-cased-sentiment-new", "junming-qiu/BertToxicClassifier"]
 
9
 
10
 
11
 
 
17
 
18
  if submitted:
19
  model_name = models[models.index(selection)]
20
+
21
+ if model_name == "junming-qiu/BertToxicClassifier":
22
+ API_TOKEN=os.environ['API-KEY']
23
+ inference = InferenceApi(repo_id=model_name, token=API_TOKEN)
24
+ predictions = inference(inputs=text)[0]
25
+ predictions = sorted(predictions, key=lambda x: x['score'], reverse=True)
26
+ st.write(predictions[0]['label']+":", predictions[0]['score'])
27
+ st.write(predictions[1]['label']+":", predictions[1]['score'])
28
+ else:
29
+
30
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
31
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
32
+ classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
33
+ result = classifier(text)
34
+ st.write("Label:", result[0]["label"])
35
+ st.write('Score: ', result[0]['score'])
36
+
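For reference, the InferenceApi branch added above can be exercised outside Streamlit. This is a hedged sketch mirroring the new app.py code path; it assumes a valid Hugging Face token is exported in the API-KEY environment variable and that the hosted junming-qiu/BertToxicClassifier endpoint is reachable.

# Sketch only: same call path as the new app.py branch, without Streamlit
import os
from huggingface_hub.inference_api import InferenceApi

inference = InferenceApi(repo_id="junming-qiu/BertToxicClassifier", token=os.environ['API-KEY'])
predictions = inference(inputs="you are great")[0]               # list of {label, score} dicts
predictions = sorted(predictions, key=lambda x: x['score'], reverse=True)
print(predictions[0]['label'], predictions[0]['score'])
print(predictions[1]['label'], predictions[1]['score'])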
init.bash CHANGED
@@ -3,4 +3,4 @@ sudo apt install python3-pip
3
  pip install streamlit
4
  pip install transformers datasets
5
  pip install torch
6
- pip install sentencepiece
 
3
  pip install streamlit
4
  pip install transformers datasets
5
  pip install torch
6
+ pip install sentencepiece
requirements.txt CHANGED
@@ -2,4 +2,5 @@ streamlit
2
  transformers
3
  datasets
4
  torch
5
- sentencepiece
 
 
2
  transformers
3
  datasets
4
  torch
5
+ sentencepiece
6
+ torch