oskarandrsson commited on
Commit
9a9fd01
·
1 Parent(s): 5983545

Upload 9 files

Browse files
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:483c11f2f7859f42f43940df7ded7b3713b46c6ebf41e3d23048f987bd68d3d1
3
+ size 14575
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:547e4ccf154b7229f6b0a3a28ce9a9ab912111a947498b98b93ffe9a563a9669
3
+ size 557
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b88405c57dbc51e9dec4f7ee277c9d9c2595d1c04838590704924d2e7b081463
3
+ size 627
source.spm ADDED
Binary file (790 kB). View file
 
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
target.spm ADDED
Binary file (815 kB). View file
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "model_max_length": 512,
4
+ "name_or_path": "Helsinki-NLP/opus-mt-en-sv",
5
+ "pad_token": "<pad>",
6
+ "separate_vocabs": false,
7
+ "source_lang": "en",
8
+ "sp_model_kwargs": {},
9
+ "special_tokens_map_file": null,
10
+ "target_lang": "sv",
11
+ "tokenizer_class": "MarianTokenizer",
12
+ "unk_token": "<unk>"
13
+ }
trainer_state.json ADDED
@@ -0,0 +1,544 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 8.998342410608572,
5
+ "global_step": 38000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.12,
12
+ "learning_rate": 4.940800378877576e-06,
13
+ "loss": 0.9254,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.24,
18
+ "learning_rate": 4.88160075775515e-06,
19
+ "loss": 0.8688,
20
+ "step": 1000
21
+ },
22
+ {
23
+ "epoch": 0.36,
24
+ "learning_rate": 4.8224011366327265e-06,
25
+ "loss": 0.8181,
26
+ "step": 1500
27
+ },
28
+ {
29
+ "epoch": 0.47,
30
+ "learning_rate": 4.763201515510301e-06,
31
+ "loss": 0.8026,
32
+ "step": 2000
33
+ },
34
+ {
35
+ "epoch": 0.59,
36
+ "learning_rate": 4.704120293630121e-06,
37
+ "loss": 0.7869,
38
+ "step": 2500
39
+ },
40
+ {
41
+ "epoch": 0.71,
42
+ "learning_rate": 4.644920672507696e-06,
43
+ "loss": 0.7747,
44
+ "step": 3000
45
+ },
46
+ {
47
+ "epoch": 0.83,
48
+ "learning_rate": 4.5857210513852715e-06,
49
+ "loss": 0.7666,
50
+ "step": 3500
51
+ },
52
+ {
53
+ "epoch": 0.95,
54
+ "learning_rate": 4.526639829505092e-06,
55
+ "loss": 0.751,
56
+ "step": 4000
57
+ },
58
+ {
59
+ "epoch": 1.0,
60
+ "eval_bleu": 60.550102386870705,
61
+ "eval_loss": 0.6660878658294678,
62
+ "eval_runtime": 642.2167,
63
+ "eval_samples_per_second": 17.536,
64
+ "eval_steps_per_second": 4.385,
65
+ "step": 4223
66
+ },
67
+ {
68
+ "epoch": 1.07,
69
+ "learning_rate": 4.4674402083826666e-06,
70
+ "loss": 0.7241,
71
+ "step": 4500
72
+ },
73
+ {
74
+ "epoch": 1.18,
75
+ "learning_rate": 4.408240587260242e-06,
76
+ "loss": 0.7178,
77
+ "step": 5000
78
+ },
79
+ {
80
+ "epoch": 1.3,
81
+ "learning_rate": 4.349040966137817e-06,
82
+ "loss": 0.7086,
83
+ "step": 5500
84
+ },
85
+ {
86
+ "epoch": 1.42,
87
+ "learning_rate": 4.289841345015393e-06,
88
+ "loss": 0.7029,
89
+ "step": 6000
90
+ },
91
+ {
92
+ "epoch": 1.54,
93
+ "learning_rate": 4.230641723892967e-06,
94
+ "loss": 0.7171,
95
+ "step": 6500
96
+ },
97
+ {
98
+ "epoch": 1.66,
99
+ "learning_rate": 4.1714421027705425e-06,
100
+ "loss": 0.6904,
101
+ "step": 7000
102
+ },
103
+ {
104
+ "epoch": 1.78,
105
+ "learning_rate": 4.112242481648118e-06,
106
+ "loss": 0.7011,
107
+ "step": 7500
108
+ },
109
+ {
110
+ "epoch": 1.89,
111
+ "learning_rate": 4.053042860525692e-06,
112
+ "loss": 0.6929,
113
+ "step": 8000
114
+ },
115
+ {
116
+ "epoch": 2.0,
117
+ "eval_bleu": 61.03917142412645,
118
+ "eval_loss": 0.6421462893486023,
119
+ "eval_runtime": 642.6358,
120
+ "eval_samples_per_second": 17.525,
121
+ "eval_steps_per_second": 4.382,
122
+ "step": 8446
123
+ },
124
+ {
125
+ "epoch": 2.01,
126
+ "learning_rate": 3.993961638645513e-06,
127
+ "loss": 0.6995,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 2.13,
132
+ "learning_rate": 3.934762017523088e-06,
133
+ "loss": 0.6573,
134
+ "step": 9000
135
+ },
136
+ {
137
+ "epoch": 2.25,
138
+ "learning_rate": 3.875680795642908e-06,
139
+ "loss": 0.6592,
140
+ "step": 9500
141
+ },
142
+ {
143
+ "epoch": 2.37,
144
+ "learning_rate": 3.8164811745204834e-06,
145
+ "loss": 0.6658,
146
+ "step": 10000
147
+ },
148
+ {
149
+ "epoch": 2.49,
150
+ "learning_rate": 3.7572815533980584e-06,
151
+ "loss": 0.6613,
152
+ "step": 10500
153
+ },
154
+ {
155
+ "epoch": 2.6,
156
+ "learning_rate": 3.6980819322756337e-06,
157
+ "loss": 0.6553,
158
+ "step": 11000
159
+ },
160
+ {
161
+ "epoch": 2.72,
162
+ "learning_rate": 3.639000710395454e-06,
163
+ "loss": 0.6497,
164
+ "step": 11500
165
+ },
166
+ {
167
+ "epoch": 2.84,
168
+ "learning_rate": 3.579801089273029e-06,
169
+ "loss": 0.6513,
170
+ "step": 12000
171
+ },
172
+ {
173
+ "epoch": 2.96,
174
+ "learning_rate": 3.520601468150604e-06,
175
+ "loss": 0.6596,
176
+ "step": 12500
177
+ },
178
+ {
179
+ "epoch": 3.0,
180
+ "eval_bleu": 61.322127725698586,
181
+ "eval_loss": 0.630657434463501,
182
+ "eval_runtime": 654.7731,
183
+ "eval_samples_per_second": 17.2,
184
+ "eval_steps_per_second": 4.301,
185
+ "step": 12669
186
+ },
187
+ {
188
+ "epoch": 3.08,
189
+ "learning_rate": 3.461401847028179e-06,
190
+ "loss": 0.6272,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 3.2,
195
+ "learning_rate": 3.4022022259057545e-06,
196
+ "loss": 0.6378,
197
+ "step": 13500
198
+ },
199
+ {
200
+ "epoch": 3.32,
201
+ "learning_rate": 3.34300260478333e-06,
202
+ "loss": 0.6432,
203
+ "step": 14000
204
+ },
205
+ {
206
+ "epoch": 3.43,
207
+ "learning_rate": 3.283802983660905e-06,
208
+ "loss": 0.6345,
209
+ "step": 14500
210
+ },
211
+ {
212
+ "epoch": 3.55,
213
+ "learning_rate": 3.2246033625384797e-06,
214
+ "loss": 0.6259,
215
+ "step": 15000
216
+ },
217
+ {
218
+ "epoch": 3.67,
219
+ "learning_rate": 3.1654037414160555e-06,
220
+ "loss": 0.6261,
221
+ "step": 15500
222
+ },
223
+ {
224
+ "epoch": 3.79,
225
+ "learning_rate": 3.1062041202936305e-06,
226
+ "loss": 0.6176,
227
+ "step": 16000
228
+ },
229
+ {
230
+ "epoch": 3.91,
231
+ "learning_rate": 3.0471228984134506e-06,
232
+ "loss": 0.6128,
233
+ "step": 16500
234
+ },
235
+ {
236
+ "epoch": 4.0,
237
+ "eval_bleu": 61.4472635817894,
238
+ "eval_loss": 0.6250163912773132,
239
+ "eval_runtime": 654.825,
240
+ "eval_samples_per_second": 17.198,
241
+ "eval_steps_per_second": 4.3,
242
+ "step": 16892
243
+ },
244
+ {
245
+ "epoch": 4.03,
246
+ "learning_rate": 2.9879232772910256e-06,
247
+ "loss": 0.6061,
248
+ "step": 17000
249
+ },
250
+ {
251
+ "epoch": 4.14,
252
+ "learning_rate": 2.9287236561686005e-06,
253
+ "loss": 0.6037,
254
+ "step": 17500
255
+ },
256
+ {
257
+ "epoch": 4.26,
258
+ "learning_rate": 2.8695240350461763e-06,
259
+ "loss": 0.6056,
260
+ "step": 18000
261
+ },
262
+ {
263
+ "epoch": 4.38,
264
+ "learning_rate": 2.810442813165996e-06,
265
+ "loss": 0.6008,
266
+ "step": 18500
267
+ },
268
+ {
269
+ "epoch": 4.5,
270
+ "learning_rate": 2.751243192043571e-06,
271
+ "loss": 0.6034,
272
+ "step": 19000
273
+ },
274
+ {
275
+ "epoch": 4.62,
276
+ "learning_rate": 2.6920435709211463e-06,
277
+ "loss": 0.5962,
278
+ "step": 19500
279
+ },
280
+ {
281
+ "epoch": 4.74,
282
+ "learning_rate": 2.6328439497987212e-06,
283
+ "loss": 0.5997,
284
+ "step": 20000
285
+ },
286
+ {
287
+ "epoch": 4.85,
288
+ "learning_rate": 2.573644328676297e-06,
289
+ "loss": 0.6001,
290
+ "step": 20500
291
+ },
292
+ {
293
+ "epoch": 4.97,
294
+ "learning_rate": 2.5145631067961168e-06,
295
+ "loss": 0.5937,
296
+ "step": 21000
297
+ },
298
+ {
299
+ "epoch": 5.0,
300
+ "eval_bleu": 61.68307480515254,
301
+ "eval_loss": 0.6184483766555786,
302
+ "eval_runtime": 657.8509,
303
+ "eval_samples_per_second": 17.119,
304
+ "eval_steps_per_second": 4.281,
305
+ "step": 21115
306
+ },
307
+ {
308
+ "epoch": 5.09,
309
+ "learning_rate": 2.4553634856736917e-06,
310
+ "loss": 0.5837,
311
+ "step": 21500
312
+ },
313
+ {
314
+ "epoch": 5.21,
315
+ "learning_rate": 2.396163864551267e-06,
316
+ "loss": 0.5807,
317
+ "step": 22000
318
+ },
319
+ {
320
+ "epoch": 5.33,
321
+ "learning_rate": 2.3369642434288424e-06,
322
+ "loss": 0.5835,
323
+ "step": 22500
324
+ },
325
+ {
326
+ "epoch": 5.45,
327
+ "learning_rate": 2.277883021548662e-06,
328
+ "loss": 0.5901,
329
+ "step": 23000
330
+ },
331
+ {
332
+ "epoch": 5.56,
333
+ "learning_rate": 2.2186834004262375e-06,
334
+ "loss": 0.5794,
335
+ "step": 23500
336
+ },
337
+ {
338
+ "epoch": 5.68,
339
+ "learning_rate": 2.1596021785460573e-06,
340
+ "loss": 0.5814,
341
+ "step": 24000
342
+ },
343
+ {
344
+ "epoch": 5.8,
345
+ "learning_rate": 2.1004025574236326e-06,
346
+ "loss": 0.5874,
347
+ "step": 24500
348
+ },
349
+ {
350
+ "epoch": 5.92,
351
+ "learning_rate": 2.041202936301208e-06,
352
+ "loss": 0.5852,
353
+ "step": 25000
354
+ },
355
+ {
356
+ "epoch": 6.0,
357
+ "eval_bleu": 61.62760640999204,
358
+ "eval_loss": 0.6162133812904358,
359
+ "eval_runtime": 658.1016,
360
+ "eval_samples_per_second": 17.113,
361
+ "eval_steps_per_second": 4.279,
362
+ "step": 25338
363
+ },
364
+ {
365
+ "epoch": 6.04,
366
+ "learning_rate": 1.982003315178783e-06,
367
+ "loss": 0.5692,
368
+ "step": 25500
369
+ },
370
+ {
371
+ "epoch": 6.16,
372
+ "learning_rate": 1.9228036940563583e-06,
373
+ "loss": 0.5773,
374
+ "step": 26000
375
+ },
376
+ {
377
+ "epoch": 6.28,
378
+ "learning_rate": 1.8636040729339332e-06,
379
+ "loss": 0.5652,
380
+ "step": 26500
381
+ },
382
+ {
383
+ "epoch": 6.39,
384
+ "learning_rate": 1.8044044518115086e-06,
385
+ "loss": 0.5635,
386
+ "step": 27000
387
+ },
388
+ {
389
+ "epoch": 6.51,
390
+ "learning_rate": 1.7452048306890837e-06,
391
+ "loss": 0.5576,
392
+ "step": 27500
393
+ },
394
+ {
395
+ "epoch": 6.63,
396
+ "learning_rate": 1.686005209566659e-06,
397
+ "loss": 0.5811,
398
+ "step": 28000
399
+ },
400
+ {
401
+ "epoch": 6.75,
402
+ "learning_rate": 1.626923987686479e-06,
403
+ "loss": 0.5702,
404
+ "step": 28500
405
+ },
406
+ {
407
+ "epoch": 6.87,
408
+ "learning_rate": 1.567724366564054e-06,
409
+ "loss": 0.5691,
410
+ "step": 29000
411
+ },
412
+ {
413
+ "epoch": 6.99,
414
+ "learning_rate": 1.5085247454416293e-06,
415
+ "loss": 0.5768,
416
+ "step": 29500
417
+ },
418
+ {
419
+ "epoch": 7.0,
420
+ "eval_bleu": 61.61706518334981,
421
+ "eval_loss": 0.6142455339431763,
422
+ "eval_runtime": 648.5567,
423
+ "eval_samples_per_second": 17.365,
424
+ "eval_steps_per_second": 4.342,
425
+ "step": 29561
426
+ },
427
+ {
428
+ "epoch": 7.1,
429
+ "learning_rate": 1.4493251243192045e-06,
430
+ "loss": 0.5516,
431
+ "step": 30000
432
+ },
433
+ {
434
+ "epoch": 7.22,
435
+ "learning_rate": 1.3902439024390246e-06,
436
+ "loss": 0.5561,
437
+ "step": 30500
438
+ },
439
+ {
440
+ "epoch": 7.34,
441
+ "learning_rate": 1.3310442813165996e-06,
442
+ "loss": 0.5586,
443
+ "step": 31000
444
+ },
445
+ {
446
+ "epoch": 7.46,
447
+ "learning_rate": 1.2718446601941747e-06,
448
+ "loss": 0.5598,
449
+ "step": 31500
450
+ },
451
+ {
452
+ "epoch": 7.58,
453
+ "learning_rate": 1.212763438313995e-06,
454
+ "loss": 0.5591,
455
+ "step": 32000
456
+ },
457
+ {
458
+ "epoch": 7.7,
459
+ "learning_rate": 1.15356381719157e-06,
460
+ "loss": 0.563,
461
+ "step": 32500
462
+ },
463
+ {
464
+ "epoch": 7.81,
465
+ "learning_rate": 1.0943641960691452e-06,
466
+ "loss": 0.5492,
467
+ "step": 33000
468
+ },
469
+ {
470
+ "epoch": 7.93,
471
+ "learning_rate": 1.0351645749467203e-06,
472
+ "loss": 0.555,
473
+ "step": 33500
474
+ },
475
+ {
476
+ "epoch": 8.0,
477
+ "eval_bleu": 61.82024377811232,
478
+ "eval_loss": 0.6130707859992981,
479
+ "eval_runtime": 628.3785,
480
+ "eval_samples_per_second": 17.922,
481
+ "eval_steps_per_second": 4.481,
482
+ "step": 33784
483
+ },
484
+ {
485
+ "epoch": 8.05,
486
+ "learning_rate": 9.759649538242955e-07,
487
+ "loss": 0.5637,
488
+ "step": 34000
489
+ },
490
+ {
491
+ "epoch": 8.17,
492
+ "learning_rate": 9.167653327018707e-07,
493
+ "loss": 0.5569,
494
+ "step": 34500
495
+ },
496
+ {
497
+ "epoch": 8.29,
498
+ "learning_rate": 8.576841108216909e-07,
499
+ "loss": 0.5456,
500
+ "step": 35000
501
+ },
502
+ {
503
+ "epoch": 8.41,
504
+ "learning_rate": 7.98484489699266e-07,
505
+ "loss": 0.5515,
506
+ "step": 35500
507
+ },
508
+ {
509
+ "epoch": 8.52,
510
+ "learning_rate": 7.392848685768411e-07,
511
+ "loss": 0.5521,
512
+ "step": 36000
513
+ },
514
+ {
515
+ "epoch": 8.64,
516
+ "learning_rate": 6.800852474544164e-07,
517
+ "loss": 0.5382,
518
+ "step": 36500
519
+ },
520
+ {
521
+ "epoch": 8.76,
522
+ "learning_rate": 6.208856263319915e-07,
523
+ "loss": 0.5494,
524
+ "step": 37000
525
+ },
526
+ {
527
+ "epoch": 8.88,
528
+ "learning_rate": 5.618044044518116e-07,
529
+ "loss": 0.5504,
530
+ "step": 37500
531
+ },
532
+ {
533
+ "epoch": 9.0,
534
+ "learning_rate": 5.026047833293867e-07,
535
+ "loss": 0.5516,
536
+ "step": 38000
537
+ }
538
+ ],
539
+ "max_steps": 42230,
540
+ "num_train_epochs": 10,
541
+ "total_flos": 1.95716976869376e+16,
542
+ "trial_name": null,
543
+ "trial_params": null
544
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff