'mint autosave'
- .gitignore +3 -0
- Copy of training.ipynb +0 -334
- Copy_of_Copy_of_training.ipynb +0 -345
- logs/1682300361.4426298/events.out.tfevents.1682300361.mint.371280.1 +0 -0
- logs/1682300884.6095285/events.out.tfevents.1682300884.mint.371280.3 +0 -0
- logs/1682300938.1223385/events.out.tfevents.1682300938.mint.371280.5 +0 -0
- logs/1682301013.2686887/events.out.tfevents.1682301013.mint.371280.7 +0 -0
- logs/events.out.tfevents.1682300361.mint.371280.0 +0 -0
- logs/events.out.tfevents.1682300884.mint.371280.2 +0 -0
- logs/events.out.tfevents.1682300938.mint.371280.4 +0 -0
- logs/events.out.tfevents.1682301013.mint.371280.6 +0 -0
- train.py +0 -138
- training.ipynb +0 -164
- traintokens.txt +0 -0
- working_training.ipynb +0 -0
.gitignore
CHANGED
@@ -1,2 +1,5 @@
 results/**
+<<<<<<< HEAD
 data/**
+=======
+>>>>>>> f375d50 ('mint autosave')

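Note: the three added lines are unresolved Git merge-conflict markers committed into .gitignore itself. Presumably the intended file keeps only the two ignore rules:

results/**
data/**
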
Copy of training.ipynb
DELETED
@@ -1,334 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "215a1aae",
"metadata": {
"executionInfo": {
"elapsed": 128,
"status": "ok",
"timestamp": 1682285319377,
"user": {
"displayName": "",
"userId": ""
},
"user_tz": 240
},
"id": "215a1aae"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-04-23 18:07:24.557548: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2023-04-23 18:07:25.431969: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
]
}
],
"source": [
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"\n",
"import pandas as pd\n",
"\n",
"from transformers import BertTokenizerFast, BertForSequenceClassification\n",
"from transformers import Trainer, TrainingArguments"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "J5Tlgp4tNd0U",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"executionInfo": {
"elapsed": 1897,
"status": "ok",
"timestamp": 1682285321454,
"user": {
"displayName": "",
"userId": ""
},
"user_tz": 240
},
"id": "J5Tlgp4tNd0U",
"outputId": "3c9f0c5b-7bc3-4c15-c5ff-0a77d3b3b607"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
"- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"model_name = \"bert-base-uncased\"\n",
"tokenizer = BertTokenizerFast.from_pretrained(model_name)\n",
"model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)\n",
"max_len = 200\n",
"\n",
"training_args = TrainingArguments(\n",
"    output_dir=\"results\",\n",
"    num_train_epochs=1,\n",
"    per_device_train_batch_size=16,\n",
"    per_device_eval_batch_size=64,\n",
"    warmup_steps=500,\n",
"    learning_rate=5e-5,\n",
"    weight_decay=0.01,\n",
"    logging_dir=\"./logs\",\n",
"    logging_steps=10\n",
"    )\n",
"\n",
"# dataset class that inherits from torch.utils.data.Dataset\n",
"class TweetDataset(Dataset):\n",
"    def __init__(self, encodings, labels):\n",
"        self.encodings = encodings\n",
"        self.labels = labels\n",
"        self.tok = tokenizer\n",
"    \n",
"    def __getitem__(self, idx):\n",
"        # encoding = self.tok(self.encodings[idx], truncation=True, padding=\"max_length\", max_length=max_len)\n",
"        item = { key: torch.tensor(val[idx]) for key, val in self.encoding.items() }\n",
"        item['labels'] = torch.tensor(self.labels[idx])\n",
"        return item\n",
"    \n",
"    def __len__(self):\n",
"        return len(self.labels)\n",
"    \n",
"class TokenizerDataset(Dataset):\n",
"    def __init__(self, strings):\n",
"        self.strings = strings\n",
"    \n",
"    def __getitem__(self, idx):\n",
"        return self.strings[idx]\n",
"    \n",
"    def __len__(self):\n",
"        return len(self.strings)\n",
"    "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9969c58c",
"metadata": {
"executionInfo": {
"elapsed": 5145,
"status": "ok",
"timestamp": 1682285326593,
"user": {
"displayName": "",
"userId": ""
},
"user_tz": 240
},
"id": "9969c58c",
"scrolled": false
},
"outputs": [],
"source": [
"train_data = pd.read_csv(\"data/train.csv\")\n",
"train_text = train_data[\"comment_text\"]\n",
"train_labels = train_data[[\"toxic\", \"severe_toxic\", \n",
"    \"obscene\", \"threat\", \n",
"    \"insult\", \"identity_hate\"]]\n",
"\n",
"test_text = pd.read_csv(\"data/test.csv\")[\"comment_text\"]\n",
"test_labels = pd.read_csv(\"data/test_labels.csv\")[[\n",
"    \"toxic\", \"severe_toxic\", \n",
"    \"obscene\", \"threat\", \n",
"    \"insult\", \"identity_hate\"]]\n",
"\n",
"# data preprocessing\n",
"\n",
"\n",
"\n",
"train_text = train_text.values.tolist()\n",
"train_labels = train_labels.values.tolist()\n",
"test_text = test_text.values.tolist()\n",
"test_labels = test_labels.values.tolist()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1n56TME9Njde",
"metadata": {
"executionInfo": {
"elapsed": 12,
"status": "ok",
"timestamp": 1682285326594,
"user": {
"displayName": "",
"userId": ""
},
"user_tz": 240
},
"id": "1n56TME9Njde"
},
"outputs": [],
"source": [
"# prepare tokenizer and dataset\n",
"\n",
"train_strings = TokenizerDataset(train_text)\n",
"test_strings = TokenizerDataset(test_text)\n",
"\n",
"train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)\n",
"test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)\n",
"\n",
"\n",
"\n",
"\n",
"# train_encodings = tokenizer.batch_encode_plus(train_text, \\\n",
"#     max_length=200, pad_to_max_length=True, \\\n",
"#     truncation=True, return_token_type_ids=False \\\n",
"#     )\n",
"# test_encodings = tokenizer.batch_encode_plus(test_text, \\\n",
"#     max_length=200, pad_to_max_length=True, \\\n",
"#     truncation=True, return_token_type_ids=False \\\n",
"#     )\n",
"\n",
"\n",
"train_encodings = tokenizer(train_text, truncation=True, padding=True)\n",
"test_encodings = tokenizer(test_text, truncation=True, padding=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5c7a657",
"metadata": {},
"outputs": [],
"source": [
"f = open(\"traintokens.txt\", 'a')\n",
"f.write(train_encodings)\n",
"f.write('\\n\\n\\n\\n\\n')\n",
"f.close()\n",
"\n",
"g = open(\"testtokens.txt\", 'a')\n",
"g.write(test_encodings)\n",
"g.write('\\n\\n\\n\\n\\n')\n",
"\n",
"g.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4kwydz67qjW9",
"metadata": {
"executionInfo": {
"elapsed": 10,
"status": "ok",
"timestamp": 1682285326595,
"user": {
"displayName": "",
"userId": ""
},
"user_tz": 240
},
"id": "4kwydz67qjW9"
},
"outputs": [],
"source": [
"train_dataset = TweetDataset(train_ecnodings, train_labels)\n",
"test_dataset = TweetDataset(test_encodings, test_labels)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "krZKjDVwNnWI",
"metadata": {
"executionInfo": {
"elapsed": 10,
"status": "ok",
"timestamp": 1682285326596,
"user": {
"displayName": "",
"userId": ""
},
"user_tz": 240
},
"id": "krZKjDVwNnWI"
},
"outputs": [],
"source": [
"# training\n",
"trainer = Trainer(\n",
"    model=model, \n",
"    args=training_args, \n",
"    train_dataset=train_dataset, \n",
"    eval_dataset=test_dataset\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "VwsyMZg_tgTg",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 416
},
"executionInfo": {
"elapsed": 27193,
"status": "error",
"timestamp": 1682285353779,
"user": {
"displayName": "",
"userId": ""
},
"user_tz": 240
},
"id": "VwsyMZg_tgTg",
"outputId": "49c3f5c8-0342-45c5-8d0f-5cd5d2d1f9e9"
},
"outputs": [],
"source": [
"trainer.train()"
]
}
],
"metadata": {
"colab": {
"provenance": [
{
"file_id": "https://github.com/joebraha/aiproject/blob/milestone-3/training.ipynb",
"timestamp": 1682285843150
}
]
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
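
The TweetDataset cell in the notebook above indexes pre-tokenized encodings inside __getitem__ (the deleted cell still refers to self.encoding and train_ecnodings). As a rough, self-contained sketch of the pattern it appears to be aiming for — names here are illustrative, not from the commit:

import torch
from torch.utils.data import Dataset

# Index a pre-tokenized BatchEncoding (dict of lists) one example at a time.
class PreTokenizedDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings  # e.g. tokenizer(texts, truncation=True, padding=True)
        self.labels = labels        # list of 6-element 0/1 lists

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)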
Copy_of_Copy_of_training.ipynb
DELETED
@@ -1,345 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "215a1aae",
"metadata": {
"id": "215a1aae"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-04-23 21:39:14.489766: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2023-04-23 21:39:15.104927: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
]
}
],
"source": [
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"\n",
"import pandas as pd\n",
"\n",
"from transformers import BertTokenizerFast, BertForSequenceClassification\n",
"from transformers import Trainer, TrainingArguments"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "J5Tlgp4tNd0U",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "J5Tlgp4tNd0U",
"outputId": "f2eef2ee-7d9d-4f5b-e35c-e6015e68f59e"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']\n",
"- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"model_name = \"bert-base-uncased\"\n",
"tokenizer = BertTokenizerFast.from_pretrained(model_name)\n",
"model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)\n",
"model = model.to(\"cuda:0\")\n",
"max_len = 200\n",
"\n",
"training_args = TrainingArguments(\n",
"    output_dir=\"results\",\n",
"    num_train_epochs=1,\n",
"    per_device_train_batch_size=16,\n",
"    per_device_eval_batch_size=64,\n",
"    warmup_steps=500,\n",
"    learning_rate=5e-5,\n",
"    weight_decay=0.01,\n",
"    logging_dir=\"./logs\",\n",
"    logging_steps=10\n",
"    )\n",
"\n",
"# dataset class that inherits from torch.utils.data.Dataset\n",
"\n",
"    \n",
"class TokenizerDataset(Dataset):\n",
"    def __init__(self, strings):\n",
"        self.strings = strings\n",
"    \n",
"    def __getitem__(self, idx):\n",
"        return self.strings[idx]\n",
"    \n",
"    def __len__(self):\n",
"        return len(self.strings)\n",
"    "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9969c58c",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9969c58c",
"outputId": "5933b10b-9ddb-4b67-b66b-589207bef2d3",
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" id comment_text \\\n",
"0 0000997932d777bf Explanation\\nWhy the edits made under my usern... \n",
"1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... \n",
"2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... \n",
"3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... \n",
"4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... \n",
"... ... ... \n",
"159566 ffe987279560d7ff \":::::And for the second time of asking, when ... \n",
"159567 ffea4adeee384e90 You should be ashamed of yourself \\n\\nThat is ... \n",
"159568 ffee36eab5c267c9 Spitzer \\n\\nUmm, theres no actual article for ... \n",
"159569 fff125370e4aaaf3 And it looks like it was actually you who put ... \n",
"159570 fff46fc426af1f9a \"\\nAnd ... I really don't think you understand... \n",
"\n",
" toxic severe_toxic obscene threat insult identity_hate \n",
"0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 \n",
"... ... ... ... ... ... ... \n",
"159566 0 0 0 0 0 0 \n",
"159567 0 0 0 0 0 0 \n",
"159568 0 0 0 0 0 0 \n",
"159569 0 0 0 0 0 0 \n",
"159570 0 0 0 0 0 0 \n",
"\n",
"[159571 rows x 8 columns]\n"
]
}
],
"source": [
"train_data = pd.read_csv(\"data/train.csv\")\n",
"print(train_data)\n",
"train_text = train_data[\"comment_text\"]\n",
"train_labels = train_data[[\"toxic\", \"severe_toxic\", \n",
"    \"obscene\", \"threat\", \n",
"    \"insult\", \"identity_hate\"]]\n",
"\n",
"test_text = pd.read_csv(\"data/test.csv\")[\"comment_text\"]\n",
"test_labels = pd.read_csv(\"data/test_labels.csv\")[[\n",
"    \"toxic\", \"severe_toxic\", \n",
"    \"obscene\", \"threat\", \n",
"    \"insult\", \"identity_hate\"]]\n",
"\n",
"# data preprocessing\n",
"\n",
"\n",
"\n",
"train_text = train_text.values.tolist()\n",
"train_labels = train_labels.values.tolist()\n",
"test_text = test_text.values.tolist()\n",
"test_labels = test_labels.values.tolist()\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "1n56TME9Njde",
"metadata": {
"id": "1n56TME9Njde"
},
"outputs": [],
"source": [
"# prepare tokenizer and dataset\n",
"\n",
"class TweetDataset(Dataset):\n",
"    def __init__(self, encodings, labels):\n",
"        self.encodings = encodings\n",
"        self.labels = labels\n",
"        self.tok = tokenizer\n",
"    \n",
"    def __getitem__(self, idx):\n",
"#         print(idx)\n",
"        print(len(self.labels))\n",
"        encoding = self.tok(self.encodings.strings[idx], truncation=True, padding=\"max_length\", max_length=max_len).to(\"cuda:0\")\n",
"        print(encoding.items())\n",
"        item = { key: torch.tensor(val) for key, val in encoding.items() }\n",
"        item['labels'] = torch.tensor(self.labels[idx])\n",
"#         print(item)\n",
"        return item\n",
"    \n",
"    def __len__(self):\n",
"        return len(self.labels)\n",
"\n",
"# no tokenizer\n",
"class TweetDataset2(Dataset):\n",
"    def __init__(self, encodings, labels):\n",
"        self.encodings = encodings\n",
"        self.labels = labels\n",
"        self.tok = tokenizer\n",
"    \n",
"    def __getitem__(self, idx):\n",
"#         print(idx)\n",
"        print(len(self.labels))\n",
"        encoding = self.tok(self.encodings.strings[idx], truncation=True, padding=\"max_length\", max_length=max_len).to(\"cuda:0\")\n",
"        print(encoding.items())\n",
"        item = { key: torch.tensor(val) for key, val in encoding.items() }\n",
"        item['labels'] = torch.tensor(self.labels[idx])\n",
"#         print(item)\n",
"        return item\n",
"    \n",
"    def __len__(self):\n",
"        return len(self.labels)\n",
"\n",
"\n",
"\n",
"\n",
"train_strings = TokenizerDataset(train_text)\n",
"test_strings = TokenizerDataset(test_text)\n",
"\n",
"train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)\n",
"test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)\n",
"\n",
"\n",
"\n",
"\n",
"train_encodings = tokenizer.batch_encode_plus(train_text, \\\n",
"    max_length=200, pad_to_max_length=True, \\\n",
"    truncation=True, return_token_type_ids=False, return_tensors='pt' \\\n",
"    ).to(\"cuda:0\")\n",
"test_encodings = tokenizer.batch_encode_plus(test_text, \\\n",
"    max_length=200, pad_to_max_length=True, \\\n",
"    truncation=True, return_token_type_ids=False, return_tensors='pt' \\\n",
"    ).to(\"cuda:0\")\n",
"\n",
"# train_encodings = tokenizer(train_text, truncation=True, padding=True)\n",
"# test_encodings = tokenizer(test_text, truncation=True, padding=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "4kwydz67qjW9",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4kwydz67qjW9",
"outputId": "1653744e-69cf-46f8-a2d1-ffc3a3a4d58a"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"159571\n",
"159571\n"
]
}
],
"source": [
"# no tokenizer\n",
"class TweetDataset3(Dataset):\n",
"    def __init__(self, encodings, labels):\n",
"        self.encodings = encodings\n",
"        self.labels = labels\n",
"        self.tok = tokenizer\n",
"    \n",
"    def __getitem__(self, idx):\n",
"        print(idx)\n",
"        item = { key: torch.tensor(val) for key, val in self.encodings.items() }\n",
"        item['labels'] = torch.tensor(self.labels[idx])\n",
"#         print(item)\n",
"        return item\n",
"    \n",
"    def __len__(self):\n",
"        return len(self.labels)\n",
"\n",
"\n",
"\n",
"train_dataset = TweetDataset3(train_encodings, train_labels)\n",
"test_dataset = TweetDataset3(test_encodings, test_labels)\n",
"\n",
"print(len(train_dataset.labels))\n",
"print(len(train_strings))\n",
"\n",
"\n",
"class MultilabelTrainer(Trainer):\n",
"    def compute_loss(self, model, inputs, return_outputs=False):\n",
"        labels = inputs.pop(\"labels\")\n",
"        outputs = model(**inputs)\n",
"        logits = outputs.logits\n",
"        loss_fct = torch.nn.BCEWithLogitsLoss()\n",
"        loss = loss_fct(logits.view(-1, self.model.config.num_labels), \n",
"            labels.float().view(-1, self.model.config.num_labels))\n",
"        return (loss, outputs) if return_outputs else loss\n",
"\n",
"\n",
"# training\n",
"trainer = MultilabelTrainer(\n",
"    model=model, \n",
"    args=training_args, \n",
"    train_dataset=train_dataset, \n",
"    eval_dataset=test_dataset\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "VwsyMZg_tgTg",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "VwsyMZg_tgTg",
"outputId": "6cf8f3aa-629e-4650-9bbd-dfeb11071ef7"
},
"outputs": [],
"source": [
"trainer.train()"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
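
The MultilabelTrainer defined in the notebook above overrides compute_loss to apply BCEWithLogitsLoss across the six label columns. A minimal standalone sketch of that loss computation (illustrative values, not part of the commit):

import torch

num_labels = 6
logits = torch.randn(4, num_labels)            # model outputs for a batch of 4
labels = torch.randint(0, 2, (4, num_labels))  # 0/1 targets per label column

loss_fct = torch.nn.BCEWithLogitsLoss()
loss = loss_fct(logits.view(-1, num_labels), labels.float().view(-1, num_labels))
print(loss.item())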
logs/1682300361.4426298/events.out.tfevents.1682300361.mint.371280.1
DELETED
Binary file (5.8 kB)

logs/1682300884.6095285/events.out.tfevents.1682300884.mint.371280.3
DELETED
Binary file (5.8 kB)

logs/1682300938.1223385/events.out.tfevents.1682300938.mint.371280.5
DELETED
Binary file (5.8 kB)

logs/1682301013.2686887/events.out.tfevents.1682301013.mint.371280.7
DELETED
Binary file (5.8 kB)

logs/events.out.tfevents.1682300361.mint.371280.0
DELETED
Binary file (4.19 kB)

logs/events.out.tfevents.1682300884.mint.371280.2
DELETED
Binary file (4.19 kB)

logs/events.out.tfevents.1682300938.mint.371280.4
DELETED
Binary file (4.19 kB)

logs/events.out.tfevents.1682301013.mint.371280.6
DELETED
Binary file (4.19 kB)

train.py
DELETED
@@ -1,138 +0,0 @@
import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd

from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)
max_len = 200

training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
    )

# dataset class that inherits from torch.utils.data.Dataset


class TokenizerDataset(Dataset):
    def __init__(self, strings):
        self.strings = strings

    def __getitem__(self, idx):
        return self.strings[idx]

    def __len__(self):
        return len(self.strings)


train_data = pd.read_csv("data/train.csv")
print(train_data)
train_text = train_data["comment_text"]
train_labels = train_data[["toxic", "severe_toxic",
    "obscene", "threat",
    "insult", "identity_hate"]]

test_text = pd.read_csv("data/test.csv")["comment_text"]
test_labels = pd.read_csv("data/test_labels.csv")[[
    "toxic", "severe_toxic",
    "obscene", "threat",
    "insult", "identity_hate"]]

# data preprocessing



train_text = train_text.values.tolist()
train_labels = train_labels.values.tolist()
test_text = test_text.values.tolist()
test_labels = test_labels.values.tolist()


# prepare tokenizer and dataset

class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.tok = tokenizer

    def __getitem__(self, idx):
        print(idx)
#         print(len(self.labels))
        encoding = self.tok(self.encodings.strings[idx], truncation=True,
            padding="max_length", max_length=max_len)
#         print(encoding.items())
        item = { key: torch.tensor(val) for key, val in encoding.items() }
        item['labels'] = torch.tensor(self.labels[idx])
#         print(item)
        return item

    def __len__(self):
        return len(self.labels)




train_strings = TokenizerDataset(train_text)
test_strings = TokenizerDataset(test_text)

train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)



# train_encodings = tokenizer.batch_encode_plus(train_text, \
#     max_length=200, pad_to_max_length=True, \
#     truncation=True, return_token_type_ids=False \
#     )
# test_encodings = tokenizer.batch_encode_plus(test_text, \
#     max_length=200, pad_to_max_length=True, \
#     truncation=True, return_token_type_ids=False \
#     )

# train_encodings = tokenizer(train_text, truncation=True, padding=True)
# test_encodings = tokenizer(test_text, truncation=True, padding=True)

train_dataset = TweetDataset(train_strings, train_labels)
test_dataset = TweetDataset(test_strings, test_labels)

print(len(train_dataset.labels))
print(len(train_strings))


class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
            labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss


# training
trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
    )

trainer.train()
training.ipynb
DELETED
@@ -1,164 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "215a1aae",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-04-23 12:34:45.188102: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2023-04-23 12:34:45.742757: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
]
}
],
"source": [
"import torch\n",
"from torch.utils.data import Dataset\n",
"\n",
"import pandas as pd\n",
"# import numpy as np\n",
"\n",
"from transformers import BertTokenizer, BertForSequenceClassification\n",
"from transformers import Trainer, TrainingArguments"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "9969c58c",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"IOPub data rate exceeded.\n",
"The notebook server will temporarily stop sending output\n",
"to the client in order to avoid crashing it.\n",
"To change this limit, set the config variable\n",
"`--NotebookApp.iopub_data_rate_limit`.\n",
"\n",
"Current values:\n",
"NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n",
"NotebookApp.rate_limit_window=3.0 (secs)\n",
"\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (631 > 512). Running this sequence through the model will result in indexing errors\n"
]
},
{
"ename": "ValueError",
"evalue": "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).",
"output_type": "error",
"traceback": [
"---------------------------------------------------------------------------",
"ValueError                                Traceback (most recent call last)",
"/tmp/ipykernel_325077/677523904.py in <module>\n     38 \n     39 train_encodings = tokenizer(train_text)\n---> 40 test_encodings = tokenizer(test_text)\n     41 \n     42 train_dataset = TweetDataset(train_encodings, train_labels)\n",
"~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py in __call__(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\n   2536         if not self._in_target_context_manager:\n   2537             self._switch_to_input_mode()\n-> 2538         encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)\n   2539         if text_target is not None:\n   2540             self._switch_to_target_mode()\n",
"~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py in _call_one(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\n   2594 \n   2595         if not _is_valid_text_input(text):\n-> 2596             raise ValueError(\n   2597                 \"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) \"\n   2598                 \"or `List[List[str]]` (batch of pretokenized examples).\"\n",
"ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)."
]
}
],
"source": [
"model_name = \"bert-base-uncased\"\n",
"\n",
"# dataset class that inherits from torch.utils.data.Dataset\n",
"class TweetDataset(Dataset):\n",
"    def __init__(self, encodings, labels):\n",
"        self.encodings = encodings\n",
"        self.labels = labels\n",
"    \n",
"    def __getitem__(self, idx):\n",
"        item = { key: torch.tensor(val[idx]) for key, val in self.encodings.items() }\n",
"        item['labels'] = torch.tensor(self.labels[idx])\n",
"        return item\n",
"    \n",
"    def __len__(self):\n",
"        return len(self.labels)\n",
"    \n",
"\n",
"\n",
"train_data = pd.read_csv(\"data/train.csv\")\n",
"train_text = train_data[\"comment_text\"].values.tolist()\n",
"train_labels = train_data[[\"toxic\", \"severe_toxic\", \n",
"    \"obscene\", \"threat\", \n",
"    \"insult\", \"identity_hate\"]].values.tolist()\n",
"\n",
"test_text = pd.read_csv(\"data/test.csv\")[\"comment_text\"].values.tolist()\n",
"test_labels = pd.read_csv(\"data/test_labels.csv\")[[\n",
"    \"toxic\", \"severe_toxic\", \n",
"    \"obscene\", \"threat\", \n",
"    \"insult\", \"identity_hate\"]].values.tolist()\n",
"\n",
"\n",
"# prepare tokenizer and dataset\n",
"\n",
"tokenizer = BertTokenizer.from_pretrained(model_name)\n",
"\n",
"print(train_text)\n",
"\n",
"\n",
"train_encodings = tokenizer(train_text)\n",
"test_encodings = tokenizer(test_text)\n",
"\n",
"train_dataset = TweetDataset(train_encodings, train_labels)\n",
"test_dataset = TweetDataset(test_encodings, test_labels)\n",
"\n",
"\n",
"# training\n",
"\n",
"\n",
"training_args = TrainingArguments(\n",
"    output_dir=\"results\",\n",
"    num_train_epochs=2,\n",
"    per_device_train_batch_size=16,\n",
"    per_device_eval_barch_size=64,\n",
"    warmup_steps=500,\n",
"    learning_rate=5e-5,\n",
"    weight_decay=0.01,\n",
"    logging_dir=\"./logs\",\n",
"    logging_steps=10\n",
"    )\n",
"\n",
"model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)\n",
"\n",
"\n",
"trainer = Trainer(\n",
"    model=model, \n",
"    args=args, \n",
"    train_dataset=train_dataset, \n",
"    val_dataset=test_dataset)\n",
"\n",
"trainer.train()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
traintokens.txt
DELETED
File without changes
working_training.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff