hbXNov commited on
Commit
22bab6b
1 Parent(s): e711763

Delete owl-con

Browse files
owl-con/README.md DELETED
@@ -1,6 +0,0 @@
1
- ## Training procedure
2
-
3
- ### Framework versions
4
-
5
-
6
- - PEFT 0.5.0
 
 
 
 
 
 
 
owl-con/adapter_config.json DELETED
@@ -1,21 +0,0 @@
1
- {
2
- "auto_mapping": {
3
- "base_model_class": "MplugOwlForConditionalGeneration",
4
- "parent_library": "mplug_owl_video.modeling_mplug_owl"
5
- },
6
- "base_model_name_or_path": "/local2/hbansal/video_text/mplug-owl-llama-7b-video",
7
- "bias": "none",
8
- "fan_in_fan_out": false,
9
- "inference_mode": true,
10
- "init_lora_weights": true,
11
- "layers_pattern": null,
12
- "layers_to_transform": null,
13
- "lora_alpha": 32,
14
- "lora_dropout": 0.05,
15
- "modules_to_save": null,
16
- "peft_type": "LORA",
17
- "r": 32,
18
- "revision": null,
19
- "target_modules": ".*language_model.*\\.(q_proj|v_proj|k_proj|o_proj|gate_proj|down_proj|up_proj)",
20
- "task_type": null
21
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
owl-con/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:24418f746189910a0cda8440616b8b30069dd2dd3198bdac96e5f9c73cc7194c
3
- size 319983949
 
 
 
 
owl-con/checkpoint-5178/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf1a86302d333ad1cafaad3eb7cc16919a84cbc3625da0671cf679e9ddcc243e
3
- size 639897797
 
 
 
 
owl-con/checkpoint-5178/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f4072beeee705a35d216a3766e51db7c650e072db92fb18804b37c4fc66ed33
3
- size 14625892902
 
 
 
 
owl-con/checkpoint-5178/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3600cac96a5f6090f0388e68ce5154e2499b0925fd9a4f9acbf92e14efd041a7
3
- size 14583
 
 
 
 
owl-con/checkpoint-5178/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c93cf10ed6fd3c0e772b9b7dfe2a4c963feec3edfe935f8b26221ed13ce8ce3f
3
- size 14583
 
 
 
 
owl-con/checkpoint-5178/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:84de293d1b76db69337a844e9fcbb7c3d02cc31f298f66012e021957e93fab45
3
- size 14583
 
 
 
 
owl-con/checkpoint-5178/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcba752ae8c97fd37f217e2e833bd34841568dd8e736506e4b43e031f3286ffe
3
- size 14583
 
 
 
 
owl-con/checkpoint-5178/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4364ab5259a0621078e4e8636f22de7cb70a3dda6e1270d6764c760c58aed389
3
- size 627
 
 
 
 
owl-con/checkpoint-5178/special_tokens_map.json DELETED
@@ -1,30 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "</s>",
11
- "lstrip": false,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": {
17
- "content": "<unk>",
18
- "lstrip": false,
19
- "normalized": true,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "unk_token": {
24
- "content": "<unk>",
25
- "lstrip": false,
26
- "normalized": true,
27
- "rstrip": false,
28
- "single_word": false
29
- }
30
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
owl-con/checkpoint-5178/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
- size 499723
 
 
 
 
owl-con/checkpoint-5178/tokenizer_config.json DELETED
@@ -1,40 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "<s>",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
- "clean_up_tokenization_spaces": false,
13
- "eos_token": {
14
- "__type": "AddedToken",
15
- "content": "</s>",
16
- "lstrip": false,
17
- "normalized": true,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "model_max_length": 1000000000000000019884624838656,
22
- "pad_token": {
23
- "__type": "AddedToken",
24
- "content": "<unk>",
25
- "lstrip": false,
26
- "normalized": true,
27
- "rstrip": false,
28
- "single_word": false
29
- },
30
- "sp_model_kwargs": {},
31
- "tokenizer_class": "LlamaTokenizer",
32
- "unk_token": {
33
- "__type": "AddedToken",
34
- "content": "<unk>",
35
- "lstrip": false,
36
- "normalized": true,
37
- "rstrip": false,
38
- "single_word": false
39
- }
40
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
owl-con/checkpoint-5178/trainer_state.json DELETED
@@ -1,3526 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 2.0,
5
- "global_step": 5178,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.0,
12
- "learning_rate": 5e-06,
13
- "loss": 3.0388,
14
- "step": 10
15
- },
16
- {
17
- "epoch": 0.01,
18
- "learning_rate": 1e-05,
19
- "loss": 2.9144,
20
- "step": 20
21
- },
22
- {
23
- "epoch": 0.01,
24
- "learning_rate": 1.5e-05,
25
- "loss": 2.4056,
26
- "step": 30
27
- },
28
- {
29
- "epoch": 0.02,
30
- "learning_rate": 2e-05,
31
- "loss": 1.3107,
32
- "step": 40
33
- },
34
- {
35
- "epoch": 0.02,
36
- "learning_rate": 2.5e-05,
37
- "loss": 0.8564,
38
- "step": 50
39
- },
40
- {
41
- "epoch": 0.02,
42
- "learning_rate": 3e-05,
43
- "loss": 0.6775,
44
- "step": 60
45
- },
46
- {
47
- "epoch": 0.03,
48
- "learning_rate": 3.5e-05,
49
- "loss": 0.54,
50
- "step": 70
51
- },
52
- {
53
- "epoch": 0.03,
54
- "learning_rate": 4e-05,
55
- "loss": 0.5361,
56
- "step": 80
57
- },
58
- {
59
- "epoch": 0.03,
60
- "learning_rate": 4.5e-05,
61
- "loss": 0.4847,
62
- "step": 90
63
- },
64
- {
65
- "epoch": 0.04,
66
- "learning_rate": 5e-05,
67
- "loss": 0.4707,
68
- "step": 100
69
- },
70
- {
71
- "epoch": 0.04,
72
- "eval_loss": 0.4903041422367096,
73
- "eval_runtime": 241.1648,
74
- "eval_samples_per_second": 2.073,
75
- "eval_steps_per_second": 0.066,
76
- "step": 100
77
- },
78
- {
79
- "epoch": 0.04,
80
- "learning_rate": 5.500000000000001e-05,
81
- "loss": 0.4843,
82
- "step": 110
83
- },
84
- {
85
- "epoch": 0.05,
86
- "learning_rate": 6e-05,
87
- "loss": 0.4912,
88
- "step": 120
89
- },
90
- {
91
- "epoch": 0.05,
92
- "learning_rate": 6.500000000000001e-05,
93
- "loss": 0.453,
94
- "step": 130
95
- },
96
- {
97
- "epoch": 0.05,
98
- "learning_rate": 7e-05,
99
- "loss": 0.4396,
100
- "step": 140
101
- },
102
- {
103
- "epoch": 0.06,
104
- "learning_rate": 7.500000000000001e-05,
105
- "loss": 0.4066,
106
- "step": 150
107
- },
108
- {
109
- "epoch": 0.06,
110
- "learning_rate": 8e-05,
111
- "loss": 0.4339,
112
- "step": 160
113
- },
114
- {
115
- "epoch": 0.07,
116
- "learning_rate": 8.5e-05,
117
- "loss": 0.4042,
118
- "step": 170
119
- },
120
- {
121
- "epoch": 0.07,
122
- "learning_rate": 9e-05,
123
- "loss": 0.4036,
124
- "step": 180
125
- },
126
- {
127
- "epoch": 0.07,
128
- "learning_rate": 9.5e-05,
129
- "loss": 0.4017,
130
- "step": 190
131
- },
132
- {
133
- "epoch": 0.08,
134
- "learning_rate": 0.0001,
135
- "loss": 0.4132,
136
- "step": 200
137
- },
138
- {
139
- "epoch": 0.08,
140
- "eval_loss": 0.4102957546710968,
141
- "eval_runtime": 237.7984,
142
- "eval_samples_per_second": 2.103,
143
- "eval_steps_per_second": 0.067,
144
- "step": 200
145
- },
146
- {
147
- "epoch": 0.08,
148
- "learning_rate": 9.979911611088791e-05,
149
- "loss": 0.4073,
150
- "step": 210
151
- },
152
- {
153
- "epoch": 0.08,
154
- "learning_rate": 9.959823222177582e-05,
155
- "loss": 0.3744,
156
- "step": 220
157
- },
158
- {
159
- "epoch": 0.09,
160
- "learning_rate": 9.939734833266372e-05,
161
- "loss": 0.4091,
162
- "step": 230
163
- },
164
- {
165
- "epoch": 0.09,
166
- "learning_rate": 9.919646444355163e-05,
167
- "loss": 0.3846,
168
- "step": 240
169
- },
170
- {
171
- "epoch": 0.1,
172
- "learning_rate": 9.899558055443953e-05,
173
- "loss": 0.4056,
174
- "step": 250
175
- },
176
- {
177
- "epoch": 0.1,
178
- "learning_rate": 9.879469666532744e-05,
179
- "loss": 0.3713,
180
- "step": 260
181
- },
182
- {
183
- "epoch": 0.1,
184
- "learning_rate": 9.859381277621536e-05,
185
- "loss": 0.3851,
186
- "step": 270
187
- },
188
- {
189
- "epoch": 0.11,
190
- "learning_rate": 9.839292888710326e-05,
191
- "loss": 0.3802,
192
- "step": 280
193
- },
194
- {
195
- "epoch": 0.11,
196
- "learning_rate": 9.819204499799116e-05,
197
- "loss": 0.3883,
198
- "step": 290
199
- },
200
- {
201
- "epoch": 0.12,
202
- "learning_rate": 9.799116110887908e-05,
203
- "loss": 0.3939,
204
- "step": 300
205
- },
206
- {
207
- "epoch": 0.12,
208
- "eval_loss": 0.3881138265132904,
209
- "eval_runtime": 233.0966,
210
- "eval_samples_per_second": 2.145,
211
- "eval_steps_per_second": 0.069,
212
- "step": 300
213
- },
214
- {
215
- "epoch": 0.12,
216
- "learning_rate": 9.779027721976698e-05,
217
- "loss": 0.3892,
218
- "step": 310
219
- },
220
- {
221
- "epoch": 0.12,
222
- "learning_rate": 9.758939333065489e-05,
223
- "loss": 0.3568,
224
- "step": 320
225
- },
226
- {
227
- "epoch": 0.13,
228
- "learning_rate": 9.73885094415428e-05,
229
- "loss": 0.3924,
230
- "step": 330
231
- },
232
- {
233
- "epoch": 0.13,
234
- "learning_rate": 9.71876255524307e-05,
235
- "loss": 0.3665,
236
- "step": 340
237
- },
238
- {
239
- "epoch": 0.14,
240
- "learning_rate": 9.69867416633186e-05,
241
- "loss": 0.3685,
242
- "step": 350
243
- },
244
- {
245
- "epoch": 0.14,
246
- "learning_rate": 9.678585777420651e-05,
247
- "loss": 0.3713,
248
- "step": 360
249
- },
250
- {
251
- "epoch": 0.14,
252
- "learning_rate": 9.658497388509442e-05,
253
- "loss": 0.356,
254
- "step": 370
255
- },
256
- {
257
- "epoch": 0.15,
258
- "learning_rate": 9.638408999598232e-05,
259
- "loss": 0.3484,
260
- "step": 380
261
- },
262
- {
263
- "epoch": 0.15,
264
- "learning_rate": 9.618320610687024e-05,
265
- "loss": 0.3865,
266
- "step": 390
267
- },
268
- {
269
- "epoch": 0.15,
270
- "learning_rate": 9.598232221775813e-05,
271
- "loss": 0.3645,
272
- "step": 400
273
- },
274
- {
275
- "epoch": 0.15,
276
- "eval_loss": 0.3724190890789032,
277
- "eval_runtime": 233.3567,
278
- "eval_samples_per_second": 2.143,
279
- "eval_steps_per_second": 0.069,
280
- "step": 400
281
- },
282
- {
283
- "epoch": 0.16,
284
- "learning_rate": 9.578143832864604e-05,
285
- "loss": 0.3665,
286
- "step": 410
287
- },
288
- {
289
- "epoch": 0.16,
290
- "learning_rate": 9.558055443953396e-05,
291
- "loss": 0.3496,
292
- "step": 420
293
- },
294
- {
295
- "epoch": 0.17,
296
- "learning_rate": 9.537967055042187e-05,
297
- "loss": 0.332,
298
- "step": 430
299
- },
300
- {
301
- "epoch": 0.17,
302
- "learning_rate": 9.517878666130977e-05,
303
- "loss": 0.3624,
304
- "step": 440
305
- },
306
- {
307
- "epoch": 0.17,
308
- "learning_rate": 9.497790277219766e-05,
309
- "loss": 0.3503,
310
- "step": 450
311
- },
312
- {
313
- "epoch": 0.18,
314
- "learning_rate": 9.477701888308558e-05,
315
- "loss": 0.337,
316
- "step": 460
317
- },
318
- {
319
- "epoch": 0.18,
320
- "learning_rate": 9.457613499397349e-05,
321
- "loss": 0.376,
322
- "step": 470
323
- },
324
- {
325
- "epoch": 0.19,
326
- "learning_rate": 9.43752511048614e-05,
327
- "loss": 0.3304,
328
- "step": 480
329
- },
330
- {
331
- "epoch": 0.19,
332
- "learning_rate": 9.41743672157493e-05,
333
- "loss": 0.3644,
334
- "step": 490
335
- },
336
- {
337
- "epoch": 0.19,
338
- "learning_rate": 9.39734833266372e-05,
339
- "loss": 0.3618,
340
- "step": 500
341
- },
342
- {
343
- "epoch": 0.19,
344
- "eval_loss": 0.3597237765789032,
345
- "eval_runtime": 233.4866,
346
- "eval_samples_per_second": 2.141,
347
- "eval_steps_per_second": 0.069,
348
- "step": 500
349
- },
350
- {
351
- "epoch": 0.2,
352
- "learning_rate": 9.377259943752511e-05,
353
- "loss": 0.3855,
354
- "step": 510
355
- },
356
- {
357
- "epoch": 0.2,
358
- "learning_rate": 9.357171554841302e-05,
359
- "loss": 0.3652,
360
- "step": 520
361
- },
362
- {
363
- "epoch": 0.2,
364
- "learning_rate": 9.337083165930092e-05,
365
- "loss": 0.3621,
366
- "step": 530
367
- },
368
- {
369
- "epoch": 0.21,
370
- "learning_rate": 9.316994777018884e-05,
371
- "loss": 0.3318,
372
- "step": 540
373
- },
374
- {
375
- "epoch": 0.21,
376
- "learning_rate": 9.296906388107675e-05,
377
- "loss": 0.3409,
378
- "step": 550
379
- },
380
- {
381
- "epoch": 0.22,
382
- "learning_rate": 9.276817999196466e-05,
383
- "loss": 0.3336,
384
- "step": 560
385
- },
386
- {
387
- "epoch": 0.22,
388
- "learning_rate": 9.256729610285255e-05,
389
- "loss": 0.3505,
390
- "step": 570
391
- },
392
- {
393
- "epoch": 0.22,
394
- "learning_rate": 9.236641221374047e-05,
395
- "loss": 0.3354,
396
- "step": 580
397
- },
398
- {
399
- "epoch": 0.23,
400
- "learning_rate": 9.216552832462837e-05,
401
- "loss": 0.3619,
402
- "step": 590
403
- },
404
- {
405
- "epoch": 0.23,
406
- "learning_rate": 9.196464443551628e-05,
407
- "loss": 0.3491,
408
- "step": 600
409
- },
410
- {
411
- "epoch": 0.23,
412
- "eval_loss": 0.3491559624671936,
413
- "eval_runtime": 227.9072,
414
- "eval_samples_per_second": 2.194,
415
- "eval_steps_per_second": 0.07,
416
- "step": 600
417
- },
418
- {
419
- "epoch": 0.24,
420
- "learning_rate": 9.176376054640418e-05,
421
- "loss": 0.353,
422
- "step": 610
423
- },
424
- {
425
- "epoch": 0.24,
426
- "learning_rate": 9.156287665729209e-05,
427
- "loss": 0.3508,
428
- "step": 620
429
- },
430
- {
431
- "epoch": 0.24,
432
- "learning_rate": 9.136199276818e-05,
433
- "loss": 0.3637,
434
- "step": 630
435
- },
436
- {
437
- "epoch": 0.25,
438
- "learning_rate": 9.11611088790679e-05,
439
- "loss": 0.3297,
440
- "step": 640
441
- },
442
- {
443
- "epoch": 0.25,
444
- "learning_rate": 9.096022498995581e-05,
445
- "loss": 0.3578,
446
- "step": 650
447
- },
448
- {
449
- "epoch": 0.25,
450
- "learning_rate": 9.075934110084371e-05,
451
- "loss": 0.3327,
452
- "step": 660
453
- },
454
- {
455
- "epoch": 0.26,
456
- "learning_rate": 9.055845721173163e-05,
457
- "loss": 0.3336,
458
- "step": 670
459
- },
460
- {
461
- "epoch": 0.26,
462
- "learning_rate": 9.035757332261952e-05,
463
- "loss": 0.3113,
464
- "step": 680
465
- },
466
- {
467
- "epoch": 0.27,
468
- "learning_rate": 9.015668943350743e-05,
469
- "loss": 0.3278,
470
- "step": 690
471
- },
472
- {
473
- "epoch": 0.27,
474
- "learning_rate": 8.995580554439535e-05,
475
- "loss": 0.3358,
476
- "step": 700
477
- },
478
- {
479
- "epoch": 0.27,
480
- "eval_loss": 0.3494350016117096,
481
- "eval_runtime": 226.5818,
482
- "eval_samples_per_second": 2.207,
483
- "eval_steps_per_second": 0.071,
484
- "step": 700
485
- },
486
- {
487
- "epoch": 0.27,
488
- "learning_rate": 8.975492165528326e-05,
489
- "loss": 0.3417,
490
- "step": 710
491
- },
492
- {
493
- "epoch": 0.28,
494
- "learning_rate": 8.955403776617116e-05,
495
- "loss": 0.3451,
496
- "step": 720
497
- },
498
- {
499
- "epoch": 0.28,
500
- "learning_rate": 8.935315387705905e-05,
501
- "loss": 0.3298,
502
- "step": 730
503
- },
504
- {
505
- "epoch": 0.29,
506
- "learning_rate": 8.915226998794697e-05,
507
- "loss": 0.3188,
508
- "step": 740
509
- },
510
- {
511
- "epoch": 0.29,
512
- "learning_rate": 8.895138609883488e-05,
513
- "loss": 0.3211,
514
- "step": 750
515
- },
516
- {
517
- "epoch": 0.29,
518
- "learning_rate": 8.875050220972279e-05,
519
- "loss": 0.3324,
520
- "step": 760
521
- },
522
- {
523
- "epoch": 0.3,
524
- "learning_rate": 8.854961832061069e-05,
525
- "loss": 0.3269,
526
- "step": 770
527
- },
528
- {
529
- "epoch": 0.3,
530
- "learning_rate": 8.83487344314986e-05,
531
- "loss": 0.3308,
532
- "step": 780
533
- },
534
- {
535
- "epoch": 0.31,
536
- "learning_rate": 8.81478505423865e-05,
537
- "loss": 0.341,
538
- "step": 790
539
- },
540
- {
541
- "epoch": 0.31,
542
- "learning_rate": 8.794696665327441e-05,
543
- "loss": 0.3299,
544
- "step": 800
545
- },
546
- {
547
- "epoch": 0.31,
548
- "eval_loss": 0.3420061469078064,
549
- "eval_runtime": 222.718,
550
- "eval_samples_per_second": 2.245,
551
- "eval_steps_per_second": 0.072,
552
- "step": 800
553
- },
554
- {
555
- "epoch": 0.31,
556
- "learning_rate": 8.774608276416231e-05,
557
- "loss": 0.3289,
558
- "step": 810
559
- },
560
- {
561
- "epoch": 0.32,
562
- "learning_rate": 8.754519887505023e-05,
563
- "loss": 0.328,
564
- "step": 820
565
- },
566
- {
567
- "epoch": 0.32,
568
- "learning_rate": 8.734431498593814e-05,
569
- "loss": 0.3464,
570
- "step": 830
571
- },
572
- {
573
- "epoch": 0.32,
574
- "learning_rate": 8.714343109682603e-05,
575
- "loss": 0.3227,
576
- "step": 840
577
- },
578
- {
579
- "epoch": 0.33,
580
- "learning_rate": 8.694254720771394e-05,
581
- "loss": 0.352,
582
- "step": 850
583
- },
584
- {
585
- "epoch": 0.33,
586
- "learning_rate": 8.674166331860186e-05,
587
- "loss": 0.3555,
588
- "step": 860
589
- },
590
- {
591
- "epoch": 0.34,
592
- "learning_rate": 8.654077942948976e-05,
593
- "loss": 0.3322,
594
- "step": 870
595
- },
596
- {
597
- "epoch": 0.34,
598
- "learning_rate": 8.633989554037767e-05,
599
- "loss": 0.316,
600
- "step": 880
601
- },
602
- {
603
- "epoch": 0.34,
604
- "learning_rate": 8.613901165126557e-05,
605
- "loss": 0.304,
606
- "step": 890
607
- },
608
- {
609
- "epoch": 0.35,
610
- "learning_rate": 8.593812776215348e-05,
611
- "loss": 0.3257,
612
- "step": 900
613
- },
614
- {
615
- "epoch": 0.35,
616
- "eval_loss": 0.3412737250328064,
617
- "eval_runtime": 230.8687,
618
- "eval_samples_per_second": 2.166,
619
- "eval_steps_per_second": 0.069,
620
- "step": 900
621
- },
622
- {
623
- "epoch": 0.35,
624
- "learning_rate": 8.573724387304139e-05,
625
- "loss": 0.3339,
626
- "step": 910
627
- },
628
- {
629
- "epoch": 0.36,
630
- "learning_rate": 8.553635998392929e-05,
631
- "loss": 0.3163,
632
- "step": 920
633
- },
634
- {
635
- "epoch": 0.36,
636
- "learning_rate": 8.53354760948172e-05,
637
- "loss": 0.3223,
638
- "step": 930
639
- },
640
- {
641
- "epoch": 0.36,
642
- "learning_rate": 8.513459220570512e-05,
643
- "loss": 0.3423,
644
- "step": 940
645
- },
646
- {
647
- "epoch": 0.37,
648
- "learning_rate": 8.493370831659301e-05,
649
- "loss": 0.3175,
650
- "step": 950
651
- },
652
- {
653
- "epoch": 0.37,
654
- "learning_rate": 8.473282442748092e-05,
655
- "loss": 0.3249,
656
- "step": 960
657
- },
658
- {
659
- "epoch": 0.37,
660
- "learning_rate": 8.453194053836882e-05,
661
- "loss": 0.3095,
662
- "step": 970
663
- },
664
- {
665
- "epoch": 0.38,
666
- "learning_rate": 8.433105664925674e-05,
667
- "loss": 0.3324,
668
- "step": 980
669
- },
670
- {
671
- "epoch": 0.38,
672
- "learning_rate": 8.413017276014465e-05,
673
- "loss": 0.3276,
674
- "step": 990
675
- },
676
- {
677
- "epoch": 0.39,
678
- "learning_rate": 8.392928887103255e-05,
679
- "loss": 0.345,
680
- "step": 1000
681
- },
682
- {
683
- "epoch": 0.39,
684
- "eval_loss": 0.3368094265460968,
685
- "eval_runtime": 233.1619,
686
- "eval_samples_per_second": 2.144,
687
- "eval_steps_per_second": 0.069,
688
- "step": 1000
689
- },
690
- {
691
- "epoch": 0.39,
692
- "learning_rate": 8.372840498192044e-05,
693
- "loss": 0.3389,
694
- "step": 1010
695
- },
696
- {
697
- "epoch": 0.39,
698
- "learning_rate": 8.352752109280836e-05,
699
- "loss": 0.321,
700
- "step": 1020
701
- },
702
- {
703
- "epoch": 0.4,
704
- "learning_rate": 8.332663720369627e-05,
705
- "loss": 0.3497,
706
- "step": 1030
707
- },
708
- {
709
- "epoch": 0.4,
710
- "learning_rate": 8.312575331458418e-05,
711
- "loss": 0.3283,
712
- "step": 1040
713
- },
714
- {
715
- "epoch": 0.41,
716
- "learning_rate": 8.292486942547208e-05,
717
- "loss": 0.31,
718
- "step": 1050
719
- },
720
- {
721
- "epoch": 0.41,
722
- "learning_rate": 8.272398553635999e-05,
723
- "loss": 0.3325,
724
- "step": 1060
725
- },
726
- {
727
- "epoch": 0.41,
728
- "learning_rate": 8.252310164724789e-05,
729
- "loss": 0.3103,
730
- "step": 1070
731
- },
732
- {
733
- "epoch": 0.42,
734
- "learning_rate": 8.23222177581358e-05,
735
- "loss": 0.3301,
736
- "step": 1080
737
- },
738
- {
739
- "epoch": 0.42,
740
- "learning_rate": 8.21213338690237e-05,
741
- "loss": 0.3295,
742
- "step": 1090
743
- },
744
- {
745
- "epoch": 0.42,
746
- "learning_rate": 8.192044997991162e-05,
747
- "loss": 0.3301,
748
- "step": 1100
749
- },
750
- {
751
- "epoch": 0.42,
752
- "eval_loss": 0.3294503390789032,
753
- "eval_runtime": 233.058,
754
- "eval_samples_per_second": 2.145,
755
- "eval_steps_per_second": 0.069,
756
- "step": 1100
757
- },
758
- {
759
- "epoch": 0.43,
760
- "learning_rate": 8.171956609079953e-05,
761
- "loss": 0.3269,
762
- "step": 1110
763
- },
764
- {
765
- "epoch": 0.43,
766
- "learning_rate": 8.151868220168742e-05,
767
- "loss": 0.347,
768
- "step": 1120
769
- },
770
- {
771
- "epoch": 0.44,
772
- "learning_rate": 8.131779831257533e-05,
773
- "loss": 0.3189,
774
- "step": 1130
775
- },
776
- {
777
- "epoch": 0.44,
778
- "learning_rate": 8.111691442346325e-05,
779
- "loss": 0.3251,
780
- "step": 1140
781
- },
782
- {
783
- "epoch": 0.44,
784
- "learning_rate": 8.091603053435115e-05,
785
- "loss": 0.305,
786
- "step": 1150
787
- },
788
- {
789
- "epoch": 0.45,
790
- "learning_rate": 8.071514664523906e-05,
791
- "loss": 0.3054,
792
- "step": 1160
793
- },
794
- {
795
- "epoch": 0.45,
796
- "learning_rate": 8.051426275612696e-05,
797
- "loss": 0.3426,
798
- "step": 1170
799
- },
800
- {
801
- "epoch": 0.46,
802
- "learning_rate": 8.031337886701487e-05,
803
- "loss": 0.3206,
804
- "step": 1180
805
- },
806
- {
807
- "epoch": 0.46,
808
- "learning_rate": 8.011249497790278e-05,
809
- "loss": 0.334,
810
- "step": 1190
811
- },
812
- {
813
- "epoch": 0.46,
814
- "learning_rate": 7.991161108879068e-05,
815
- "loss": 0.3311,
816
- "step": 1200
817
- },
818
- {
819
- "epoch": 0.46,
820
- "eval_loss": 0.3330426812171936,
821
- "eval_runtime": 231.4394,
822
- "eval_samples_per_second": 2.16,
823
- "eval_steps_per_second": 0.069,
824
- "step": 1200
825
- },
826
- {
827
- "epoch": 0.47,
828
- "learning_rate": 7.971072719967859e-05,
829
- "loss": 0.3326,
830
- "step": 1210
831
- },
832
- {
833
- "epoch": 0.47,
834
- "learning_rate": 7.950984331056651e-05,
835
- "loss": 0.3229,
836
- "step": 1220
837
- },
838
- {
839
- "epoch": 0.48,
840
- "learning_rate": 7.93089594214544e-05,
841
- "loss": 0.3252,
842
- "step": 1230
843
- },
844
- {
845
- "epoch": 0.48,
846
- "learning_rate": 7.91080755323423e-05,
847
- "loss": 0.314,
848
- "step": 1240
849
- },
850
- {
851
- "epoch": 0.48,
852
- "learning_rate": 7.890719164323021e-05,
853
- "loss": 0.3191,
854
- "step": 1250
855
- },
856
- {
857
- "epoch": 0.49,
858
- "learning_rate": 7.870630775411813e-05,
859
- "loss": 0.3193,
860
- "step": 1260
861
- },
862
- {
863
- "epoch": 0.49,
864
- "learning_rate": 7.850542386500604e-05,
865
- "loss": 0.3183,
866
- "step": 1270
867
- },
868
- {
869
- "epoch": 0.49,
870
- "learning_rate": 7.830453997589394e-05,
871
- "loss": 0.3141,
872
- "step": 1280
873
- },
874
- {
875
- "epoch": 0.5,
876
- "learning_rate": 7.810365608678183e-05,
877
- "loss": 0.3069,
878
- "step": 1290
879
- },
880
- {
881
- "epoch": 0.5,
882
- "learning_rate": 7.790277219766975e-05,
883
- "loss": 0.3203,
884
- "step": 1300
885
- },
886
- {
887
- "epoch": 0.5,
888
- "eval_loss": 0.32666015625,
889
- "eval_runtime": 228.195,
890
- "eval_samples_per_second": 2.191,
891
- "eval_steps_per_second": 0.07,
892
- "step": 1300
893
- },
894
- {
895
- "epoch": 0.51,
896
- "learning_rate": 7.770188830855766e-05,
897
- "loss": 0.3044,
898
- "step": 1310
899
- },
900
- {
901
- "epoch": 0.51,
902
- "learning_rate": 7.750100441944557e-05,
903
- "loss": 0.3052,
904
- "step": 1320
905
- },
906
- {
907
- "epoch": 0.51,
908
- "learning_rate": 7.730012053033347e-05,
909
- "loss": 0.3156,
910
- "step": 1330
911
- },
912
- {
913
- "epoch": 0.52,
914
- "learning_rate": 7.709923664122138e-05,
915
- "loss": 0.3271,
916
- "step": 1340
917
- },
918
- {
919
- "epoch": 0.52,
920
- "learning_rate": 7.689835275210928e-05,
921
- "loss": 0.2991,
922
- "step": 1350
923
- },
924
- {
925
- "epoch": 0.53,
926
- "learning_rate": 7.669746886299719e-05,
927
- "loss": 0.306,
928
- "step": 1360
929
- },
930
- {
931
- "epoch": 0.53,
932
- "learning_rate": 7.64965849738851e-05,
933
- "loss": 0.3122,
934
- "step": 1370
935
- },
936
- {
937
- "epoch": 0.53,
938
- "learning_rate": 7.629570108477301e-05,
939
- "loss": 0.3101,
940
- "step": 1380
941
- },
942
- {
943
- "epoch": 0.54,
944
- "learning_rate": 7.609481719566092e-05,
945
- "loss": 0.3106,
946
- "step": 1390
947
- },
948
- {
949
- "epoch": 0.54,
950
- "learning_rate": 7.589393330654881e-05,
951
- "loss": 0.3177,
952
- "step": 1400
953
- },
954
- {
955
- "epoch": 0.54,
956
- "eval_loss": 0.3282645046710968,
957
- "eval_runtime": 228.5175,
958
- "eval_samples_per_second": 2.188,
959
- "eval_steps_per_second": 0.07,
960
- "step": 1400
961
- },
962
- {
963
- "epoch": 0.54,
964
- "learning_rate": 7.569304941743672e-05,
965
- "loss": 0.2955,
966
- "step": 1410
967
- },
968
- {
969
- "epoch": 0.55,
970
- "learning_rate": 7.549216552832464e-05,
971
- "loss": 0.3287,
972
- "step": 1420
973
- },
974
- {
975
- "epoch": 0.55,
976
- "learning_rate": 7.529128163921254e-05,
977
- "loss": 0.3224,
978
- "step": 1430
979
- },
980
- {
981
- "epoch": 0.56,
982
- "learning_rate": 7.509039775010045e-05,
983
- "loss": 0.3231,
984
- "step": 1440
985
- },
986
- {
987
- "epoch": 0.56,
988
- "learning_rate": 7.488951386098834e-05,
989
- "loss": 0.3218,
990
- "step": 1450
991
- },
992
- {
993
- "epoch": 0.56,
994
- "learning_rate": 7.468862997187626e-05,
995
- "loss": 0.3167,
996
- "step": 1460
997
- },
998
- {
999
- "epoch": 0.57,
1000
- "learning_rate": 7.448774608276417e-05,
1001
- "loss": 0.3252,
1002
- "step": 1470
1003
- },
1004
- {
1005
- "epoch": 0.57,
1006
- "learning_rate": 7.428686219365207e-05,
1007
- "loss": 0.315,
1008
- "step": 1480
1009
- },
1010
- {
1011
- "epoch": 0.58,
1012
- "learning_rate": 7.408597830453998e-05,
1013
- "loss": 0.3012,
1014
- "step": 1490
1015
- },
1016
- {
1017
- "epoch": 0.58,
1018
- "learning_rate": 7.38850944154279e-05,
1019
- "loss": 0.3055,
1020
- "step": 1500
1021
- },
1022
- {
1023
- "epoch": 0.58,
1024
- "eval_loss": 0.3236956000328064,
1025
- "eval_runtime": 235.4381,
1026
- "eval_samples_per_second": 2.124,
1027
- "eval_steps_per_second": 0.068,
1028
- "step": 1500
1029
- },
1030
- {
1031
- "epoch": 0.58,
1032
- "learning_rate": 7.368421052631579e-05,
1033
- "loss": 0.3194,
1034
- "step": 1510
1035
- },
1036
- {
1037
- "epoch": 0.59,
1038
- "learning_rate": 7.34833266372037e-05,
1039
- "loss": 0.3254,
1040
- "step": 1520
1041
- },
1042
- {
1043
- "epoch": 0.59,
1044
- "learning_rate": 7.32824427480916e-05,
1045
- "loss": 0.3108,
1046
- "step": 1530
1047
- },
1048
- {
1049
- "epoch": 0.59,
1050
- "learning_rate": 7.308155885897952e-05,
1051
- "loss": 0.3029,
1052
- "step": 1540
1053
- },
1054
- {
1055
- "epoch": 0.6,
1056
- "learning_rate": 7.288067496986743e-05,
1057
- "loss": 0.319,
1058
- "step": 1550
1059
- },
1060
- {
1061
- "epoch": 0.6,
1062
- "learning_rate": 7.267979108075532e-05,
1063
- "loss": 0.3114,
1064
- "step": 1560
1065
- },
1066
- {
1067
- "epoch": 0.61,
1068
- "learning_rate": 7.247890719164322e-05,
1069
- "loss": 0.2944,
1070
- "step": 1570
1071
- },
1072
- {
1073
- "epoch": 0.61,
1074
- "learning_rate": 7.227802330253114e-05,
1075
- "loss": 0.3255,
1076
- "step": 1580
1077
- },
1078
- {
1079
- "epoch": 0.61,
1080
- "learning_rate": 7.207713941341905e-05,
1081
- "loss": 0.321,
1082
- "step": 1590
1083
- },
1084
- {
1085
- "epoch": 0.62,
1086
- "learning_rate": 7.187625552430696e-05,
1087
- "loss": 0.3235,
1088
- "step": 1600
1089
- },
1090
- {
1091
- "epoch": 0.62,
1092
- "eval_loss": 0.3245675265789032,
1093
- "eval_runtime": 229.0197,
1094
- "eval_samples_per_second": 2.183,
1095
- "eval_steps_per_second": 0.07,
1096
- "step": 1600
1097
- },
1098
- {
1099
- "epoch": 0.62,
1100
- "learning_rate": 7.167537163519486e-05,
1101
- "loss": 0.2904,
1102
- "step": 1610
1103
- },
1104
- {
1105
- "epoch": 0.63,
1106
- "learning_rate": 7.147448774608277e-05,
1107
- "loss": 0.3511,
1108
- "step": 1620
1109
- },
1110
- {
1111
- "epoch": 0.63,
1112
- "learning_rate": 7.127360385697067e-05,
1113
- "loss": 0.3088,
1114
- "step": 1630
1115
- },
1116
- {
1117
- "epoch": 0.63,
1118
- "learning_rate": 7.107271996785858e-05,
1119
- "loss": 0.3015,
1120
- "step": 1640
1121
- },
1122
- {
1123
- "epoch": 0.64,
1124
- "learning_rate": 7.087183607874648e-05,
1125
- "loss": 0.3,
1126
- "step": 1650
1127
- },
1128
- {
1129
- "epoch": 0.64,
1130
- "learning_rate": 7.06709521896344e-05,
1131
- "loss": 0.3487,
1132
- "step": 1660
1133
- },
1134
- {
1135
- "epoch": 0.65,
1136
- "learning_rate": 7.04700683005223e-05,
1137
- "loss": 0.3029,
1138
- "step": 1670
1139
- },
1140
- {
1141
- "epoch": 0.65,
1142
- "learning_rate": 7.02691844114102e-05,
1143
- "loss": 0.307,
1144
- "step": 1680
1145
- },
1146
- {
1147
- "epoch": 0.65,
1148
- "learning_rate": 7.006830052229811e-05,
1149
- "loss": 0.3026,
1150
- "step": 1690
1151
- },
1152
- {
1153
- "epoch": 0.66,
1154
- "learning_rate": 6.986741663318603e-05,
1155
- "loss": 0.3203,
1156
- "step": 1700
1157
- },
1158
- {
1159
- "epoch": 0.66,
1160
- "eval_loss": 0.3210797905921936,
1161
- "eval_runtime": 235.7603,
1162
- "eval_samples_per_second": 2.121,
1163
- "eval_steps_per_second": 0.068,
1164
- "step": 1700
1165
- },
1166
- {
1167
- "epoch": 0.66,
1168
- "learning_rate": 6.966653274407393e-05,
1169
- "loss": 0.3036,
1170
- "step": 1710
1171
- },
1172
- {
1173
- "epoch": 0.66,
1174
- "learning_rate": 6.946564885496184e-05,
1175
- "loss": 0.32,
1176
- "step": 1720
1177
- },
1178
- {
1179
- "epoch": 0.67,
1180
- "learning_rate": 6.926476496584973e-05,
1181
- "loss": 0.3065,
1182
- "step": 1730
1183
- },
1184
- {
1185
- "epoch": 0.67,
1186
- "learning_rate": 6.906388107673765e-05,
1187
- "loss": 0.2972,
1188
- "step": 1740
1189
- },
1190
- {
1191
- "epoch": 0.68,
1192
- "learning_rate": 6.886299718762556e-05,
1193
- "loss": 0.2972,
1194
- "step": 1750
1195
- },
1196
- {
1197
- "epoch": 0.68,
1198
- "learning_rate": 6.866211329851346e-05,
1199
- "loss": 0.321,
1200
- "step": 1760
1201
- },
1202
- {
1203
- "epoch": 0.68,
1204
- "learning_rate": 6.846122940940137e-05,
1205
- "loss": 0.3206,
1206
- "step": 1770
1207
- },
1208
- {
1209
- "epoch": 0.69,
1210
- "learning_rate": 6.826034552028927e-05,
1211
- "loss": 0.304,
1212
- "step": 1780
1213
- },
1214
- {
1215
- "epoch": 0.69,
1216
- "learning_rate": 6.805946163117718e-05,
1217
- "loss": 0.3101,
1218
- "step": 1790
1219
- },
1220
- {
1221
- "epoch": 0.7,
1222
- "learning_rate": 6.785857774206509e-05,
1223
- "loss": 0.3263,
1224
- "step": 1800
1225
- },
1226
- {
1227
- "epoch": 0.7,
1228
- "eval_loss": 0.3179059624671936,
1229
- "eval_runtime": 228.96,
1230
- "eval_samples_per_second": 2.184,
1231
- "eval_steps_per_second": 0.07,
1232
- "step": 1800
1233
- },
1234
- {
1235
- "epoch": 0.7,
1236
- "learning_rate": 6.765769385295299e-05,
1237
- "loss": 0.3311,
1238
- "step": 1810
1239
- },
1240
- {
1241
- "epoch": 0.7,
1242
- "learning_rate": 6.745680996384091e-05,
1243
- "loss": 0.3047,
1244
- "step": 1820
1245
- },
1246
- {
1247
- "epoch": 0.71,
1248
- "learning_rate": 6.725592607472882e-05,
1249
- "loss": 0.3085,
1250
- "step": 1830
1251
- },
1252
- {
1253
- "epoch": 0.71,
1254
- "learning_rate": 6.705504218561671e-05,
1255
- "loss": 0.3083,
1256
- "step": 1840
1257
- },
1258
- {
1259
- "epoch": 0.71,
1260
- "learning_rate": 6.685415829650461e-05,
1261
- "loss": 0.3225,
1262
- "step": 1850
1263
- },
1264
- {
1265
- "epoch": 0.72,
1266
- "learning_rate": 6.665327440739253e-05,
1267
- "loss": 0.3136,
1268
- "step": 1860
1269
- },
1270
- {
1271
- "epoch": 0.72,
1272
- "learning_rate": 6.645239051828044e-05,
1273
- "loss": 0.3191,
1274
- "step": 1870
1275
- },
1276
- {
1277
- "epoch": 0.73,
1278
- "learning_rate": 6.625150662916835e-05,
1279
- "loss": 0.3547,
1280
- "step": 1880
1281
- },
1282
- {
1283
- "epoch": 0.73,
1284
- "learning_rate": 6.605062274005625e-05,
1285
- "loss": 0.3466,
1286
- "step": 1890
1287
- },
1288
- {
1289
- "epoch": 0.73,
1290
- "learning_rate": 6.584973885094416e-05,
1291
- "loss": 0.2985,
1292
- "step": 1900
1293
- },
1294
- {
1295
- "epoch": 0.73,
1296
- "eval_loss": 0.324951171875,
1297
- "eval_runtime": 228.2301,
1298
- "eval_samples_per_second": 2.191,
1299
- "eval_steps_per_second": 0.07,
1300
- "step": 1900
1301
- },
1302
- {
1303
- "epoch": 0.74,
1304
- "learning_rate": 6.564885496183206e-05,
1305
- "loss": 0.3164,
1306
- "step": 1910
1307
- },
1308
- {
1309
- "epoch": 0.74,
1310
- "learning_rate": 6.544797107271997e-05,
1311
- "loss": 0.3171,
1312
- "step": 1920
1313
- },
1314
- {
1315
- "epoch": 0.75,
1316
- "learning_rate": 6.524708718360787e-05,
1317
- "loss": 0.2916,
1318
- "step": 1930
1319
- },
1320
- {
1321
- "epoch": 0.75,
1322
- "learning_rate": 6.50462032944958e-05,
1323
- "loss": 0.3082,
1324
- "step": 1940
1325
- },
1326
- {
1327
- "epoch": 0.75,
1328
- "learning_rate": 6.484531940538369e-05,
1329
- "loss": 0.308,
1330
- "step": 1950
1331
- },
1332
- {
1333
- "epoch": 0.76,
1334
- "learning_rate": 6.464443551627159e-05,
1335
- "loss": 0.3085,
1336
- "step": 1960
1337
- },
1338
- {
1339
- "epoch": 0.76,
1340
- "learning_rate": 6.44435516271595e-05,
1341
- "loss": 0.3062,
1342
- "step": 1970
1343
- },
1344
- {
1345
- "epoch": 0.76,
1346
- "learning_rate": 6.424266773804742e-05,
1347
- "loss": 0.3214,
1348
- "step": 1980
1349
- },
1350
- {
1351
- "epoch": 0.77,
1352
- "learning_rate": 6.404178384893532e-05,
1353
- "loss": 0.3046,
1354
- "step": 1990
1355
- },
1356
- {
1357
- "epoch": 0.77,
1358
- "learning_rate": 6.384089995982323e-05,
1359
- "loss": 0.3161,
1360
- "step": 2000
1361
- },
1362
- {
1363
- "epoch": 0.77,
1364
- "eval_loss": 0.3216029703617096,
1365
- "eval_runtime": 230.2587,
1366
- "eval_samples_per_second": 2.171,
1367
- "eval_steps_per_second": 0.069,
1368
- "step": 2000
1369
- },
1370
- {
1371
- "epoch": 0.78,
1372
- "learning_rate": 6.364001607071113e-05,
1373
- "loss": 0.3274,
1374
- "step": 2010
1375
- },
1376
- {
1377
- "epoch": 0.78,
1378
- "learning_rate": 6.343913218159904e-05,
1379
- "loss": 0.341,
1380
- "step": 2020
1381
- },
1382
- {
1383
- "epoch": 0.78,
1384
- "learning_rate": 6.323824829248695e-05,
1385
- "loss": 0.3226,
1386
- "step": 2030
1387
- },
1388
- {
1389
- "epoch": 0.79,
1390
- "learning_rate": 6.303736440337485e-05,
1391
- "loss": 0.3187,
1392
- "step": 2040
1393
- },
1394
- {
1395
- "epoch": 0.79,
1396
- "learning_rate": 6.283648051426276e-05,
1397
- "loss": 0.3003,
1398
- "step": 2050
1399
- },
1400
- {
1401
- "epoch": 0.8,
1402
- "learning_rate": 6.263559662515066e-05,
1403
- "loss": 0.2849,
1404
- "step": 2060
1405
- },
1406
- {
1407
- "epoch": 0.8,
1408
- "learning_rate": 6.243471273603857e-05,
1409
- "loss": 0.3096,
1410
- "step": 2070
1411
- },
1412
- {
1413
- "epoch": 0.8,
1414
- "learning_rate": 6.223382884692648e-05,
1415
- "loss": 0.2993,
1416
- "step": 2080
1417
- },
1418
- {
1419
- "epoch": 0.81,
1420
- "learning_rate": 6.203294495781438e-05,
1421
- "loss": 0.2984,
1422
- "step": 2090
1423
- },
1424
- {
1425
- "epoch": 0.81,
1426
- "learning_rate": 6.18320610687023e-05,
1427
- "loss": 0.2987,
1428
- "step": 2100
1429
- },
1430
- {
1431
- "epoch": 0.81,
1432
- "eval_loss": 0.3201729953289032,
1433
- "eval_runtime": 227.0048,
1434
- "eval_samples_per_second": 2.203,
1435
- "eval_steps_per_second": 0.07,
1436
- "step": 2100
1437
- },
1438
- {
1439
- "epoch": 0.81,
1440
- "learning_rate": 6.16311771795902e-05,
1441
- "loss": 0.3092,
1442
- "step": 2110
1443
- },
1444
- {
1445
- "epoch": 0.82,
1446
- "learning_rate": 6.14302932904781e-05,
1447
- "loss": 0.2995,
1448
- "step": 2120
1449
- },
1450
- {
1451
- "epoch": 0.82,
1452
- "learning_rate": 6.1229409401366e-05,
1453
- "loss": 0.3114,
1454
- "step": 2130
1455
- },
1456
- {
1457
- "epoch": 0.83,
1458
- "learning_rate": 6.102852551225392e-05,
1459
- "loss": 0.3077,
1460
- "step": 2140
1461
- },
1462
- {
1463
- "epoch": 0.83,
1464
- "learning_rate": 6.082764162314183e-05,
1465
- "loss": 0.3121,
1466
- "step": 2150
1467
- },
1468
- {
1469
- "epoch": 0.83,
1470
- "learning_rate": 6.0626757734029736e-05,
1471
- "loss": 0.3042,
1472
- "step": 2160
1473
- },
1474
- {
1475
- "epoch": 0.84,
1476
- "learning_rate": 6.0425873844917635e-05,
1477
- "loss": 0.3129,
1478
- "step": 2170
1479
- },
1480
- {
1481
- "epoch": 0.84,
1482
- "learning_rate": 6.022498995580555e-05,
1483
- "loss": 0.2843,
1484
- "step": 2180
1485
- },
1486
- {
1487
- "epoch": 0.85,
1488
- "learning_rate": 6.002410606669345e-05,
1489
- "loss": 0.2973,
1490
- "step": 2190
1491
- },
1492
- {
1493
- "epoch": 0.85,
1494
- "learning_rate": 5.982322217758136e-05,
1495
- "loss": 0.3405,
1496
- "step": 2200
1497
- },
1498
- {
1499
- "epoch": 0.85,
1500
- "eval_loss": 0.3213588297367096,
1501
- "eval_runtime": 231.1731,
1502
- "eval_samples_per_second": 2.163,
1503
- "eval_steps_per_second": 0.069,
1504
- "step": 2200
1505
- },
1506
- {
1507
- "epoch": 0.85,
1508
- "learning_rate": 5.962233828846927e-05,
1509
- "loss": 0.2974,
1510
- "step": 2210
1511
- },
1512
- {
1513
- "epoch": 0.86,
1514
- "learning_rate": 5.942145439935718e-05,
1515
- "loss": 0.2978,
1516
- "step": 2220
1517
- },
1518
- {
1519
- "epoch": 0.86,
1520
- "learning_rate": 5.9220570510245076e-05,
1521
- "loss": 0.2922,
1522
- "step": 2230
1523
- },
1524
- {
1525
- "epoch": 0.87,
1526
- "learning_rate": 5.901968662113299e-05,
1527
- "loss": 0.321,
1528
- "step": 2240
1529
- },
1530
- {
1531
- "epoch": 0.87,
1532
- "learning_rate": 5.8818802732020895e-05,
1533
- "loss": 0.2926,
1534
- "step": 2250
1535
- },
1536
- {
1537
- "epoch": 0.87,
1538
- "learning_rate": 5.86179188429088e-05,
1539
- "loss": 0.3175,
1540
- "step": 2260
1541
- },
1542
- {
1543
- "epoch": 0.88,
1544
- "learning_rate": 5.841703495379671e-05,
1545
- "loss": 0.2816,
1546
- "step": 2270
1547
- },
1548
- {
1549
- "epoch": 0.88,
1550
- "learning_rate": 5.821615106468461e-05,
1551
- "loss": 0.2761,
1552
- "step": 2280
1553
- },
1554
- {
1555
- "epoch": 0.88,
1556
- "learning_rate": 5.801526717557252e-05,
1557
- "loss": 0.2786,
1558
- "step": 2290
1559
- },
1560
- {
1561
- "epoch": 0.89,
1562
- "learning_rate": 5.781438328646043e-05,
1563
- "loss": 0.295,
1564
- "step": 2300
1565
- },
1566
- {
1567
- "epoch": 0.89,
1568
- "eval_loss": 0.3176967203617096,
1569
- "eval_runtime": 229.1487,
1570
- "eval_samples_per_second": 2.182,
1571
- "eval_steps_per_second": 0.07,
1572
- "step": 2300
1573
- },
1574
- {
1575
- "epoch": 0.89,
1576
- "learning_rate": 5.7613499397348337e-05,
1577
- "loss": 0.3299,
1578
- "step": 2310
1579
- },
1580
- {
1581
- "epoch": 0.9,
1582
- "learning_rate": 5.741261550823624e-05,
1583
- "loss": 0.3122,
1584
- "step": 2320
1585
- },
1586
- {
1587
- "epoch": 0.9,
1588
- "learning_rate": 5.7211731619124155e-05,
1589
- "loss": 0.3288,
1590
- "step": 2330
1591
- },
1592
- {
1593
- "epoch": 0.9,
1594
- "learning_rate": 5.7010847730012054e-05,
1595
- "loss": 0.34,
1596
- "step": 2340
1597
- },
1598
- {
1599
- "epoch": 0.91,
1600
- "learning_rate": 5.680996384089996e-05,
1601
- "loss": 0.3161,
1602
- "step": 2350
1603
- },
1604
- {
1605
- "epoch": 0.91,
1606
- "learning_rate": 5.6609079951787866e-05,
1607
- "loss": 0.2696,
1608
- "step": 2360
1609
- },
1610
- {
1611
- "epoch": 0.92,
1612
- "learning_rate": 5.640819606267578e-05,
1613
- "loss": 0.3285,
1614
- "step": 2370
1615
- },
1616
- {
1617
- "epoch": 0.92,
1618
- "learning_rate": 5.6207312173563684e-05,
1619
- "loss": 0.3158,
1620
- "step": 2380
1621
- },
1622
- {
1623
- "epoch": 0.92,
1624
- "learning_rate": 5.600642828445158e-05,
1625
- "loss": 0.3229,
1626
- "step": 2390
1627
- },
1628
- {
1629
- "epoch": 0.93,
1630
- "learning_rate": 5.5805544395339496e-05,
1631
- "loss": 0.3168,
1632
- "step": 2400
1633
- },
1634
- {
1635
- "epoch": 0.93,
1636
- "eval_loss": 0.3182547390460968,
1637
- "eval_runtime": 234.4281,
1638
- "eval_samples_per_second": 2.133,
1639
- "eval_steps_per_second": 0.068,
1640
- "step": 2400
1641
- },
1642
- {
1643
- "epoch": 0.93,
1644
- "learning_rate": 5.56046605062274e-05,
1645
- "loss": 0.3167,
1646
- "step": 2410
1647
- },
1648
- {
1649
- "epoch": 0.93,
1650
- "learning_rate": 5.540377661711531e-05,
1651
- "loss": 0.3128,
1652
- "step": 2420
1653
- },
1654
- {
1655
- "epoch": 0.94,
1656
- "learning_rate": 5.520289272800322e-05,
1657
- "loss": 0.293,
1658
- "step": 2430
1659
- },
1660
- {
1661
- "epoch": 0.94,
1662
- "learning_rate": 5.5002008838891126e-05,
1663
- "loss": 0.3113,
1664
- "step": 2440
1665
- },
1666
- {
1667
- "epoch": 0.95,
1668
- "learning_rate": 5.4801124949779025e-05,
1669
- "loss": 0.3167,
1670
- "step": 2450
1671
- },
1672
- {
1673
- "epoch": 0.95,
1674
- "learning_rate": 5.460024106066694e-05,
1675
- "loss": 0.3119,
1676
- "step": 2460
1677
- },
1678
- {
1679
- "epoch": 0.95,
1680
- "learning_rate": 5.439935717155484e-05,
1681
- "loss": 0.295,
1682
- "step": 2470
1683
- },
1684
- {
1685
- "epoch": 0.96,
1686
- "learning_rate": 5.419847328244275e-05,
1687
- "loss": 0.303,
1688
- "step": 2480
1689
- },
1690
- {
1691
- "epoch": 0.96,
1692
- "learning_rate": 5.399758939333066e-05,
1693
- "loss": 0.3183,
1694
- "step": 2490
1695
- },
1696
- {
1697
- "epoch": 0.97,
1698
- "learning_rate": 5.379670550421857e-05,
1699
- "loss": 0.3159,
1700
- "step": 2500
1701
- },
1702
- {
1703
- "epoch": 0.97,
1704
- "eval_loss": 0.3175223171710968,
1705
- "eval_runtime": 242.5362,
1706
- "eval_samples_per_second": 2.062,
1707
- "eval_steps_per_second": 0.066,
1708
- "step": 2500
1709
- },
1710
- {
1711
- "epoch": 0.97,
1712
- "learning_rate": 5.3595821615106466e-05,
1713
- "loss": 0.3086,
1714
- "step": 2510
1715
- },
1716
- {
1717
- "epoch": 0.97,
1718
- "learning_rate": 5.339493772599438e-05,
1719
- "loss": 0.3184,
1720
- "step": 2520
1721
- },
1722
- {
1723
- "epoch": 0.98,
1724
- "learning_rate": 5.3194053836882285e-05,
1725
- "loss": 0.3131,
1726
- "step": 2530
1727
- },
1728
- {
1729
- "epoch": 0.98,
1730
- "learning_rate": 5.299316994777019e-05,
1731
- "loss": 0.3099,
1732
- "step": 2540
1733
- },
1734
- {
1735
- "epoch": 0.98,
1736
- "learning_rate": 5.27922860586581e-05,
1737
- "loss": 0.3392,
1738
- "step": 2550
1739
- },
1740
- {
1741
- "epoch": 0.99,
1742
- "learning_rate": 5.2591402169546e-05,
1743
- "loss": 0.3207,
1744
- "step": 2560
1745
- },
1746
- {
1747
- "epoch": 0.99,
1748
- "learning_rate": 5.239051828043391e-05,
1749
- "loss": 0.307,
1750
- "step": 2570
1751
- },
1752
- {
1753
- "epoch": 1.0,
1754
- "learning_rate": 5.218963439132182e-05,
1755
- "loss": 0.2946,
1756
- "step": 2580
1757
- },
1758
- {
1759
- "epoch": 1.0,
1760
- "learning_rate": 5.198875050220973e-05,
1761
- "loss": 0.3138,
1762
- "step": 2590
1763
- },
1764
- {
1765
- "epoch": 1.0,
1766
- "learning_rate": 5.178786661309763e-05,
1767
- "loss": 0.2704,
1768
- "step": 2600
1769
- },
1770
- {
1771
- "epoch": 1.0,
1772
- "eval_loss": 0.31640625,
1773
- "eval_runtime": 234.6863,
1774
- "eval_samples_per_second": 2.131,
1775
- "eval_steps_per_second": 0.068,
1776
- "step": 2600
1777
- },
1778
- {
1779
- "epoch": 1.01,
1780
- "learning_rate": 5.1586982723985545e-05,
1781
- "loss": 0.2552,
1782
- "step": 2610
1783
- },
1784
- {
1785
- "epoch": 1.01,
1786
- "learning_rate": 5.1386098834873444e-05,
1787
- "loss": 0.2701,
1788
- "step": 2620
1789
- },
1790
- {
1791
- "epoch": 1.02,
1792
- "learning_rate": 5.118521494576135e-05,
1793
- "loss": 0.2582,
1794
- "step": 2630
1795
- },
1796
- {
1797
- "epoch": 1.02,
1798
- "learning_rate": 5.098433105664926e-05,
1799
- "loss": 0.2592,
1800
- "step": 2640
1801
- },
1802
- {
1803
- "epoch": 1.02,
1804
- "learning_rate": 5.078344716753717e-05,
1805
- "loss": 0.2804,
1806
- "step": 2650
1807
- },
1808
- {
1809
- "epoch": 1.03,
1810
- "learning_rate": 5.0582563278425074e-05,
1811
- "loss": 0.2553,
1812
- "step": 2660
1813
- },
1814
- {
1815
- "epoch": 1.03,
1816
- "learning_rate": 5.038167938931297e-05,
1817
- "loss": 0.2541,
1818
- "step": 2670
1819
- },
1820
- {
1821
- "epoch": 1.04,
1822
- "learning_rate": 5.0180795500200886e-05,
1823
- "loss": 0.2536,
1824
- "step": 2680
1825
- },
1826
- {
1827
- "epoch": 1.04,
1828
- "learning_rate": 4.997991161108879e-05,
1829
- "loss": 0.2612,
1830
- "step": 2690
1831
- },
1832
- {
1833
- "epoch": 1.04,
1834
- "learning_rate": 4.9779027721976704e-05,
1835
- "loss": 0.278,
1836
- "step": 2700
1837
- },
1838
- {
1839
- "epoch": 1.04,
1840
- "eval_loss": 0.3170689046382904,
1841
- "eval_runtime": 234.3707,
1842
- "eval_samples_per_second": 2.133,
1843
- "eval_steps_per_second": 0.068,
1844
- "step": 2700
1845
- },
1846
- {
1847
- "epoch": 1.05,
1848
- "learning_rate": 4.95781438328646e-05,
1849
- "loss": 0.2573,
1850
- "step": 2710
1851
- },
1852
- {
1853
- "epoch": 1.05,
1854
- "learning_rate": 4.9377259943752516e-05,
1855
- "loss": 0.2553,
1856
- "step": 2720
1857
- },
1858
- {
1859
- "epoch": 1.05,
1860
- "learning_rate": 4.917637605464042e-05,
1861
- "loss": 0.2675,
1862
- "step": 2730
1863
- },
1864
- {
1865
- "epoch": 1.06,
1866
- "learning_rate": 4.897549216552833e-05,
1867
- "loss": 0.2842,
1868
- "step": 2740
1869
- },
1870
- {
1871
- "epoch": 1.06,
1872
- "learning_rate": 4.877460827641623e-05,
1873
- "loss": 0.2523,
1874
- "step": 2750
1875
- },
1876
- {
1877
- "epoch": 1.07,
1878
- "learning_rate": 4.8573724387304146e-05,
1879
- "loss": 0.2698,
1880
- "step": 2760
1881
- },
1882
- {
1883
- "epoch": 1.07,
1884
- "learning_rate": 4.8372840498192045e-05,
1885
- "loss": 0.2911,
1886
- "step": 2770
1887
- },
1888
- {
1889
- "epoch": 1.07,
1890
- "learning_rate": 4.817195660907996e-05,
1891
- "loss": 0.2909,
1892
- "step": 2780
1893
- },
1894
- {
1895
- "epoch": 1.08,
1896
- "learning_rate": 4.7971072719967863e-05,
1897
- "loss": 0.2753,
1898
- "step": 2790
1899
- },
1900
- {
1901
- "epoch": 1.08,
1902
- "learning_rate": 4.777018883085577e-05,
1903
- "loss": 0.2703,
1904
- "step": 2800
1905
- },
1906
- {
1907
- "epoch": 1.08,
1908
- "eval_loss": 0.312744140625,
1909
- "eval_runtime": 232.5523,
1910
- "eval_samples_per_second": 2.15,
1911
- "eval_steps_per_second": 0.069,
1912
- "step": 2800
1913
- },
1914
- {
1915
- "epoch": 1.09,
1916
- "learning_rate": 4.7569304941743675e-05,
1917
- "loss": 0.2798,
1918
- "step": 2810
1919
- },
1920
- {
1921
- "epoch": 1.09,
1922
- "learning_rate": 4.736842105263158e-05,
1923
- "loss": 0.2738,
1924
- "step": 2820
1925
- },
1926
- {
1927
- "epoch": 1.09,
1928
- "learning_rate": 4.716753716351949e-05,
1929
- "loss": 0.2757,
1930
- "step": 2830
1931
- },
1932
- {
1933
- "epoch": 1.1,
1934
- "learning_rate": 4.69666532744074e-05,
1935
- "loss": 0.2441,
1936
- "step": 2840
1937
- },
1938
- {
1939
- "epoch": 1.1,
1940
- "learning_rate": 4.67657693852953e-05,
1941
- "loss": 0.2597,
1942
- "step": 2850
1943
- },
1944
- {
1945
- "epoch": 1.1,
1946
- "learning_rate": 4.656488549618321e-05,
1947
- "loss": 0.2546,
1948
- "step": 2860
1949
- },
1950
- {
1951
- "epoch": 1.11,
1952
- "learning_rate": 4.636400160707112e-05,
1953
- "loss": 0.2567,
1954
- "step": 2870
1955
- },
1956
- {
1957
- "epoch": 1.11,
1958
- "learning_rate": 4.616311771795902e-05,
1959
- "loss": 0.2677,
1960
- "step": 2880
1961
- },
1962
- {
1963
- "epoch": 1.12,
1964
- "learning_rate": 4.596223382884693e-05,
1965
- "loss": 0.2544,
1966
- "step": 2890
1967
- },
1968
- {
1969
- "epoch": 1.12,
1970
- "learning_rate": 4.576134993973484e-05,
1971
- "loss": 0.2596,
1972
- "step": 2900
1973
- },
1974
- {
1975
- "epoch": 1.12,
1976
- "eval_loss": 0.312744140625,
1977
- "eval_runtime": 234.4348,
1978
- "eval_samples_per_second": 2.133,
1979
- "eval_steps_per_second": 0.068,
1980
- "step": 2900
1981
- },
1982
- {
1983
- "epoch": 1.12,
1984
- "learning_rate": 4.556046605062274e-05,
1985
- "loss": 0.2678,
1986
- "step": 2910
1987
- },
1988
- {
1989
- "epoch": 1.13,
1990
- "learning_rate": 4.535958216151065e-05,
1991
- "loss": 0.2741,
1992
- "step": 2920
1993
- },
1994
- {
1995
- "epoch": 1.13,
1996
- "learning_rate": 4.515869827239856e-05,
1997
- "loss": 0.2664,
1998
- "step": 2930
1999
- },
2000
- {
2001
- "epoch": 1.14,
2002
- "learning_rate": 4.4957814383286464e-05,
2003
- "loss": 0.2743,
2004
- "step": 2940
2005
- },
2006
- {
2007
- "epoch": 1.14,
2008
- "learning_rate": 4.475693049417437e-05,
2009
- "loss": 0.2587,
2010
- "step": 2950
2011
- },
2012
- {
2013
- "epoch": 1.14,
2014
- "learning_rate": 4.4556046605062276e-05,
2015
- "loss": 0.2744,
2016
- "step": 2960
2017
- },
2018
- {
2019
- "epoch": 1.15,
2020
- "learning_rate": 4.435516271595018e-05,
2021
- "loss": 0.2637,
2022
- "step": 2970
2023
- },
2024
- {
2025
- "epoch": 1.15,
2026
- "learning_rate": 4.4154278826838094e-05,
2027
- "loss": 0.2604,
2028
- "step": 2980
2029
- },
2030
- {
2031
- "epoch": 1.15,
2032
- "learning_rate": 4.395339493772599e-05,
2033
- "loss": 0.2609,
2034
- "step": 2990
2035
- },
2036
- {
2037
- "epoch": 1.16,
2038
- "learning_rate": 4.3752511048613906e-05,
2039
- "loss": 0.2564,
2040
- "step": 3000
2041
- },
2042
- {
2043
- "epoch": 1.16,
2044
- "eval_loss": 0.317138671875,
2045
- "eval_runtime": 236.9531,
2046
- "eval_samples_per_second": 2.11,
2047
- "eval_steps_per_second": 0.068,
2048
- "step": 3000
2049
- },
2050
- {
2051
- "epoch": 1.16,
2052
- "learning_rate": 4.355162715950181e-05,
2053
- "loss": 0.2585,
2054
- "step": 3010
2055
- },
2056
- {
2057
- "epoch": 1.17,
2058
- "learning_rate": 4.335074327038972e-05,
2059
- "loss": 0.272,
2060
- "step": 3020
2061
- },
2062
- {
2063
- "epoch": 1.17,
2064
- "learning_rate": 4.3149859381277623e-05,
2065
- "loss": 0.2593,
2066
- "step": 3030
2067
- },
2068
- {
2069
- "epoch": 1.17,
2070
- "learning_rate": 4.2948975492165536e-05,
2071
- "loss": 0.2658,
2072
- "step": 3040
2073
- },
2074
- {
2075
- "epoch": 1.18,
2076
- "learning_rate": 4.2748091603053435e-05,
2077
- "loss": 0.2643,
2078
- "step": 3050
2079
- },
2080
- {
2081
- "epoch": 1.18,
2082
- "learning_rate": 4.254720771394135e-05,
2083
- "loss": 0.2631,
2084
- "step": 3060
2085
- },
2086
- {
2087
- "epoch": 1.19,
2088
- "learning_rate": 4.234632382482925e-05,
2089
- "loss": 0.2376,
2090
- "step": 3070
2091
- },
2092
- {
2093
- "epoch": 1.19,
2094
- "learning_rate": 4.214543993571716e-05,
2095
- "loss": 0.2645,
2096
- "step": 3080
2097
- },
2098
- {
2099
- "epoch": 1.19,
2100
- "learning_rate": 4.1944556046605065e-05,
2101
- "loss": 0.2903,
2102
- "step": 3090
2103
- },
2104
- {
2105
- "epoch": 1.2,
2106
- "learning_rate": 4.174367215749297e-05,
2107
- "loss": 0.2502,
2108
- "step": 3100
2109
- },
2110
- {
2111
- "epoch": 1.2,
2112
- "eval_loss": 0.3148367702960968,
2113
- "eval_runtime": 232.416,
2114
- "eval_samples_per_second": 2.151,
2115
- "eval_steps_per_second": 0.069,
2116
- "step": 3100
2117
- },
2118
- {
2119
- "epoch": 1.2,
2120
- "learning_rate": 4.154278826838088e-05,
2121
- "loss": 0.2389,
2122
- "step": 3110
2123
- },
2124
- {
2125
- "epoch": 1.21,
2126
- "learning_rate": 4.134190437926879e-05,
2127
- "loss": 0.2575,
2128
- "step": 3120
2129
- },
2130
- {
2131
- "epoch": 1.21,
2132
- "learning_rate": 4.114102049015669e-05,
2133
- "loss": 0.2825,
2134
- "step": 3130
2135
- },
2136
- {
2137
- "epoch": 1.21,
2138
- "learning_rate": 4.09401366010446e-05,
2139
- "loss": 0.2574,
2140
- "step": 3140
2141
- },
2142
- {
2143
- "epoch": 1.22,
2144
- "learning_rate": 4.073925271193251e-05,
2145
- "loss": 0.2707,
2146
- "step": 3150
2147
- },
2148
- {
2149
- "epoch": 1.22,
2150
- "learning_rate": 4.053836882282041e-05,
2151
- "loss": 0.2589,
2152
- "step": 3160
2153
- },
2154
- {
2155
- "epoch": 1.22,
2156
- "learning_rate": 4.033748493370832e-05,
2157
- "loss": 0.2646,
2158
- "step": 3170
2159
- },
2160
- {
2161
- "epoch": 1.23,
2162
- "learning_rate": 4.0136601044596224e-05,
2163
- "loss": 0.2685,
2164
- "step": 3180
2165
- },
2166
- {
2167
- "epoch": 1.23,
2168
- "learning_rate": 3.993571715548413e-05,
2169
- "loss": 0.2595,
2170
- "step": 3190
2171
- },
2172
- {
2173
- "epoch": 1.24,
2174
- "learning_rate": 3.973483326637204e-05,
2175
- "loss": 0.2477,
2176
- "step": 3200
2177
- },
2178
- {
2179
- "epoch": 1.24,
2180
- "eval_loss": 0.3164760172367096,
2181
- "eval_runtime": 229.5059,
2182
- "eval_samples_per_second": 2.179,
2183
- "eval_steps_per_second": 0.07,
2184
- "step": 3200
2185
- },
2186
- {
2187
- "epoch": 1.24,
2188
- "learning_rate": 3.953394937725994e-05,
2189
- "loss": 0.2598,
2190
- "step": 3210
2191
- },
2192
- {
2193
- "epoch": 1.24,
2194
- "learning_rate": 3.9333065488147854e-05,
2195
- "loss": 0.2626,
2196
- "step": 3220
2197
- },
2198
- {
2199
- "epoch": 1.25,
2200
- "learning_rate": 3.913218159903576e-05,
2201
- "loss": 0.2538,
2202
- "step": 3230
2203
- },
2204
- {
2205
- "epoch": 1.25,
2206
- "learning_rate": 3.8931297709923666e-05,
2207
- "loss": 0.2562,
2208
- "step": 3240
2209
- },
2210
- {
2211
- "epoch": 1.26,
2212
- "learning_rate": 3.873041382081157e-05,
2213
- "loss": 0.2625,
2214
- "step": 3250
2215
- },
2216
- {
2217
- "epoch": 1.26,
2218
- "learning_rate": 3.8529529931699484e-05,
2219
- "loss": 0.2638,
2220
- "step": 3260
2221
- },
2222
- {
2223
- "epoch": 1.26,
2224
- "learning_rate": 3.8328646042587384e-05,
2225
- "loss": 0.2415,
2226
- "step": 3270
2227
- },
2228
- {
2229
- "epoch": 1.27,
2230
- "learning_rate": 3.8127762153475296e-05,
2231
- "loss": 0.2894,
2232
- "step": 3280
2233
- },
2234
- {
2235
- "epoch": 1.27,
2236
- "learning_rate": 3.79268782643632e-05,
2237
- "loss": 0.2606,
2238
- "step": 3290
2239
- },
2240
- {
2241
- "epoch": 1.27,
2242
- "learning_rate": 3.772599437525111e-05,
2243
- "loss": 0.2607,
2244
- "step": 3300
2245
- },
2246
- {
2247
- "epoch": 1.27,
2248
- "eval_loss": 0.3150809109210968,
2249
- "eval_runtime": 231.3186,
2250
- "eval_samples_per_second": 2.162,
2251
- "eval_steps_per_second": 0.069,
2252
- "step": 3300
2253
- },
2254
- {
2255
- "epoch": 1.28,
2256
- "learning_rate": 3.7525110486139014e-05,
2257
- "loss": 0.2853,
2258
- "step": 3310
2259
- },
2260
- {
2261
- "epoch": 1.28,
2262
- "learning_rate": 3.732422659702692e-05,
2263
- "loss": 0.2597,
2264
- "step": 3320
2265
- },
2266
- {
2267
- "epoch": 1.29,
2268
- "learning_rate": 3.7123342707914825e-05,
2269
- "loss": 0.2653,
2270
- "step": 3330
2271
- },
2272
- {
2273
- "epoch": 1.29,
2274
- "learning_rate": 3.692245881880274e-05,
2275
- "loss": 0.2626,
2276
- "step": 3340
2277
- },
2278
- {
2279
- "epoch": 1.29,
2280
- "learning_rate": 3.672157492969064e-05,
2281
- "loss": 0.2465,
2282
- "step": 3350
2283
- },
2284
- {
2285
- "epoch": 1.3,
2286
- "learning_rate": 3.652069104057855e-05,
2287
- "loss": 0.2997,
2288
- "step": 3360
2289
- },
2290
- {
2291
- "epoch": 1.3,
2292
- "learning_rate": 3.6319807151466455e-05,
2293
- "loss": 0.2614,
2294
- "step": 3370
2295
- },
2296
- {
2297
- "epoch": 1.31,
2298
- "learning_rate": 3.611892326235436e-05,
2299
- "loss": 0.2708,
2300
- "step": 3380
2301
- },
2302
- {
2303
- "epoch": 1.31,
2304
- "learning_rate": 3.591803937324227e-05,
2305
- "loss": 0.2633,
2306
- "step": 3390
2307
- },
2308
- {
2309
- "epoch": 1.31,
2310
- "learning_rate": 3.571715548413018e-05,
2311
- "loss": 0.2607,
2312
- "step": 3400
2313
- },
2314
- {
2315
- "epoch": 1.31,
2316
- "eval_loss": 0.309326171875,
2317
- "eval_runtime": 234.7514,
2318
- "eval_samples_per_second": 2.13,
2319
- "eval_steps_per_second": 0.068,
2320
- "step": 3400
2321
- },
2322
- {
2323
- "epoch": 1.32,
2324
- "learning_rate": 3.551627159501808e-05,
2325
- "loss": 0.2605,
2326
- "step": 3410
2327
- },
2328
- {
2329
- "epoch": 1.32,
2330
- "learning_rate": 3.531538770590599e-05,
2331
- "loss": 0.2557,
2332
- "step": 3420
2333
- },
2334
- {
2335
- "epoch": 1.32,
2336
- "learning_rate": 3.511450381679389e-05,
2337
- "loss": 0.2566,
2338
- "step": 3430
2339
- },
2340
- {
2341
- "epoch": 1.33,
2342
- "learning_rate": 3.49136199276818e-05,
2343
- "loss": 0.2502,
2344
- "step": 3440
2345
- },
2346
- {
2347
- "epoch": 1.33,
2348
- "learning_rate": 3.471273603856971e-05,
2349
- "loss": 0.2512,
2350
- "step": 3450
2351
- },
2352
- {
2353
- "epoch": 1.34,
2354
- "learning_rate": 3.4511852149457614e-05,
2355
- "loss": 0.2966,
2356
- "step": 3460
2357
- },
2358
- {
2359
- "epoch": 1.34,
2360
- "learning_rate": 3.431096826034552e-05,
2361
- "loss": 0.2408,
2362
- "step": 3470
2363
- },
2364
- {
2365
- "epoch": 1.34,
2366
- "learning_rate": 3.411008437123343e-05,
2367
- "loss": 0.2667,
2368
- "step": 3480
2369
- },
2370
- {
2371
- "epoch": 1.35,
2372
- "learning_rate": 3.390920048212133e-05,
2373
- "loss": 0.2851,
2374
- "step": 3490
2375
- },
2376
- {
2377
- "epoch": 1.35,
2378
- "learning_rate": 3.3708316593009245e-05,
2379
- "loss": 0.2488,
2380
- "step": 3500
2381
- },
2382
- {
2383
- "epoch": 1.35,
2384
- "eval_loss": 0.3107561469078064,
2385
- "eval_runtime": 232.6614,
2386
- "eval_samples_per_second": 2.149,
2387
- "eval_steps_per_second": 0.069,
2388
- "step": 3500
2389
- },
2390
- {
2391
- "epoch": 1.36,
2392
- "learning_rate": 3.350743270389715e-05,
2393
- "loss": 0.2942,
2394
- "step": 3510
2395
- },
2396
- {
2397
- "epoch": 1.36,
2398
- "learning_rate": 3.3306548814785056e-05,
2399
- "loss": 0.2605,
2400
- "step": 3520
2401
- },
2402
- {
2403
- "epoch": 1.36,
2404
- "learning_rate": 3.310566492567296e-05,
2405
- "loss": 0.2352,
2406
- "step": 3530
2407
- },
2408
- {
2409
- "epoch": 1.37,
2410
- "learning_rate": 3.290478103656087e-05,
2411
- "loss": 0.2371,
2412
- "step": 3540
2413
- },
2414
- {
2415
- "epoch": 1.37,
2416
- "learning_rate": 3.2703897147448774e-05,
2417
- "loss": 0.2597,
2418
- "step": 3550
2419
- },
2420
- {
2421
- "epoch": 1.38,
2422
- "learning_rate": 3.2503013258336686e-05,
2423
- "loss": 0.2452,
2424
- "step": 3560
2425
- },
2426
- {
2427
- "epoch": 1.38,
2428
- "learning_rate": 3.2302129369224585e-05,
2429
- "loss": 0.2617,
2430
- "step": 3570
2431
- },
2432
- {
2433
- "epoch": 1.38,
2434
- "learning_rate": 3.21012454801125e-05,
2435
- "loss": 0.2664,
2436
- "step": 3580
2437
- },
2438
- {
2439
- "epoch": 1.39,
2440
- "learning_rate": 3.1900361591000404e-05,
2441
- "loss": 0.28,
2442
- "step": 3590
2443
- },
2444
- {
2445
- "epoch": 1.39,
2446
- "learning_rate": 3.169947770188831e-05,
2447
- "loss": 0.2376,
2448
- "step": 3600
2449
- },
2450
- {
2451
- "epoch": 1.39,
2452
- "eval_loss": 0.3105817437171936,
2453
- "eval_runtime": 232.1998,
2454
- "eval_samples_per_second": 2.153,
2455
- "eval_steps_per_second": 0.069,
2456
- "step": 3600
2457
- },
2458
- {
2459
- "epoch": 1.39,
2460
- "learning_rate": 3.1498593812776215e-05,
2461
- "loss": 0.2667,
2462
- "step": 3610
2463
- },
2464
- {
2465
- "epoch": 1.4,
2466
- "learning_rate": 3.129770992366413e-05,
2467
- "loss": 0.2599,
2468
- "step": 3620
2469
- },
2470
- {
2471
- "epoch": 1.4,
2472
- "learning_rate": 3.109682603455203e-05,
2473
- "loss": 0.2656,
2474
- "step": 3630
2475
- },
2476
- {
2477
- "epoch": 1.41,
2478
- "learning_rate": 3.089594214543994e-05,
2479
- "loss": 0.2311,
2480
- "step": 3640
2481
- },
2482
- {
2483
- "epoch": 1.41,
2484
- "learning_rate": 3.0695058256327845e-05,
2485
- "loss": 0.2506,
2486
- "step": 3650
2487
- },
2488
- {
2489
- "epoch": 1.41,
2490
- "learning_rate": 3.049417436721575e-05,
2491
- "loss": 0.2598,
2492
- "step": 3660
2493
- },
2494
- {
2495
- "epoch": 1.42,
2496
- "learning_rate": 3.0293290478103657e-05,
2497
- "loss": 0.2654,
2498
- "step": 3670
2499
- },
2500
- {
2501
- "epoch": 1.42,
2502
- "learning_rate": 3.0092406588991563e-05,
2503
- "loss": 0.2454,
2504
- "step": 3680
2505
- },
2506
- {
2507
- "epoch": 1.43,
2508
- "learning_rate": 2.9891522699879472e-05,
2509
- "loss": 0.2513,
2510
- "step": 3690
2511
- },
2512
- {
2513
- "epoch": 1.43,
2514
- "learning_rate": 2.9690638810767378e-05,
2515
- "loss": 0.2642,
2516
- "step": 3700
2517
- },
2518
- {
2519
- "epoch": 1.43,
2520
- "eval_loss": 0.3145228922367096,
2521
- "eval_runtime": 240.0973,
2522
- "eval_samples_per_second": 2.082,
2523
- "eval_steps_per_second": 0.067,
2524
- "step": 3700
2525
- },
2526
- {
2527
- "epoch": 1.43,
2528
- "learning_rate": 2.9489754921655284e-05,
2529
- "loss": 0.2386,
2530
- "step": 3710
2531
- },
2532
- {
2533
- "epoch": 1.44,
2534
- "learning_rate": 2.9288871032543193e-05,
2535
- "loss": 0.2449,
2536
- "step": 3720
2537
- },
2538
- {
2539
- "epoch": 1.44,
2540
- "learning_rate": 2.90879871434311e-05,
2541
- "loss": 0.2581,
2542
- "step": 3730
2543
- },
2544
- {
2545
- "epoch": 1.44,
2546
- "learning_rate": 2.8887103254319005e-05,
2547
- "loss": 0.25,
2548
- "step": 3740
2549
- },
2550
- {
2551
- "epoch": 1.45,
2552
- "learning_rate": 2.8686219365206914e-05,
2553
- "loss": 0.2552,
2554
- "step": 3750
2555
- },
2556
- {
2557
- "epoch": 1.45,
2558
- "learning_rate": 2.848533547609482e-05,
2559
- "loss": 0.2819,
2560
- "step": 3760
2561
- },
2562
- {
2563
- "epoch": 1.46,
2564
- "learning_rate": 2.8284451586982725e-05,
2565
- "loss": 0.2491,
2566
- "step": 3770
2567
- },
2568
- {
2569
- "epoch": 1.46,
2570
- "learning_rate": 2.8083567697870635e-05,
2571
- "loss": 0.2582,
2572
- "step": 3780
2573
- },
2574
- {
2575
- "epoch": 1.46,
2576
- "learning_rate": 2.7882683808758537e-05,
2577
- "loss": 0.2479,
2578
- "step": 3790
2579
- },
2580
- {
2581
- "epoch": 1.47,
2582
- "learning_rate": 2.7681799919646446e-05,
2583
- "loss": 0.2686,
2584
- "step": 3800
2585
- },
2586
- {
2587
- "epoch": 1.47,
2588
- "eval_loss": 0.3105817437171936,
2589
- "eval_runtime": 229.2876,
2590
- "eval_samples_per_second": 2.181,
2591
- "eval_steps_per_second": 0.07,
2592
- "step": 3800
2593
- },
2594
- {
2595
- "epoch": 1.47,
2596
- "learning_rate": 2.7480916030534355e-05,
2597
- "loss": 0.265,
2598
- "step": 3810
2599
- },
2600
- {
2601
- "epoch": 1.48,
2602
- "learning_rate": 2.7280032141422258e-05,
2603
- "loss": 0.2557,
2604
- "step": 3820
2605
- },
2606
- {
2607
- "epoch": 1.48,
2608
- "learning_rate": 2.7079148252310167e-05,
2609
- "loss": 0.2458,
2610
- "step": 3830
2611
- },
2612
- {
2613
- "epoch": 1.48,
2614
- "learning_rate": 2.6878264363198073e-05,
2615
- "loss": 0.2683,
2616
- "step": 3840
2617
- },
2618
- {
2619
- "epoch": 1.49,
2620
- "learning_rate": 2.667738047408598e-05,
2621
- "loss": 0.2494,
2622
- "step": 3850
2623
- },
2624
- {
2625
- "epoch": 1.49,
2626
- "learning_rate": 2.6476496584973888e-05,
2627
- "loss": 0.2631,
2628
- "step": 3860
2629
- },
2630
- {
2631
- "epoch": 1.49,
2632
- "learning_rate": 2.6275612695861794e-05,
2633
- "loss": 0.256,
2634
- "step": 3870
2635
- },
2636
- {
2637
- "epoch": 1.5,
2638
- "learning_rate": 2.60747288067497e-05,
2639
- "loss": 0.2686,
2640
- "step": 3880
2641
- },
2642
- {
2643
- "epoch": 1.5,
2644
- "learning_rate": 2.587384491763761e-05,
2645
- "loss": 0.2562,
2646
- "step": 3890
2647
- },
2648
- {
2649
- "epoch": 1.51,
2650
- "learning_rate": 2.5672961028525515e-05,
2651
- "loss": 0.2474,
2652
- "step": 3900
2653
- },
2654
- {
2655
- "epoch": 1.51,
2656
- "eval_loss": 0.3085588812828064,
2657
- "eval_runtime": 234.5548,
2658
- "eval_samples_per_second": 2.132,
2659
- "eval_steps_per_second": 0.068,
2660
- "step": 3900
2661
- },
2662
- {
2663
- "epoch": 1.51,
2664
- "learning_rate": 2.547207713941342e-05,
2665
- "loss": 0.2615,
2666
- "step": 3910
2667
- },
2668
- {
2669
- "epoch": 1.51,
2670
- "learning_rate": 2.527119325030133e-05,
2671
- "loss": 0.2588,
2672
- "step": 3920
2673
- },
2674
- {
2675
- "epoch": 1.52,
2676
- "learning_rate": 2.5070309361189232e-05,
2677
- "loss": 0.2585,
2678
- "step": 3930
2679
- },
2680
- {
2681
- "epoch": 1.52,
2682
- "learning_rate": 2.486942547207714e-05,
2683
- "loss": 0.2703,
2684
- "step": 3940
2685
- },
2686
- {
2687
- "epoch": 1.53,
2688
- "learning_rate": 2.4668541582965047e-05,
2689
- "loss": 0.2341,
2690
- "step": 3950
2691
- },
2692
- {
2693
- "epoch": 1.53,
2694
- "learning_rate": 2.4467657693852956e-05,
2695
- "loss": 0.2556,
2696
- "step": 3960
2697
- },
2698
- {
2699
- "epoch": 1.53,
2700
- "learning_rate": 2.4266773804740862e-05,
2701
- "loss": 0.2621,
2702
- "step": 3970
2703
- },
2704
- {
2705
- "epoch": 1.54,
2706
- "learning_rate": 2.4065889915628768e-05,
2707
- "loss": 0.2614,
2708
- "step": 3980
2709
- },
2710
- {
2711
- "epoch": 1.54,
2712
- "learning_rate": 2.3865006026516677e-05,
2713
- "loss": 0.2531,
2714
- "step": 3990
2715
- },
2716
- {
2717
- "epoch": 1.54,
2718
- "learning_rate": 2.3664122137404583e-05,
2719
- "loss": 0.265,
2720
- "step": 4000
2721
- },
2722
- {
2723
- "epoch": 1.54,
2724
- "eval_loss": 0.3104073703289032,
2725
- "eval_runtime": 234.8987,
2726
- "eval_samples_per_second": 2.129,
2727
- "eval_steps_per_second": 0.068,
2728
- "step": 4000
2729
- },
2730
- {
2731
- "epoch": 1.55,
2732
- "learning_rate": 2.346323824829249e-05,
2733
- "loss": 0.2684,
2734
- "step": 4010
2735
- },
2736
- {
2737
- "epoch": 1.55,
2738
- "learning_rate": 2.3262354359180395e-05,
2739
- "loss": 0.2399,
2740
- "step": 4020
2741
- },
2742
- {
2743
- "epoch": 1.56,
2744
- "learning_rate": 2.3061470470068304e-05,
2745
- "loss": 0.2632,
2746
- "step": 4030
2747
- },
2748
- {
2749
- "epoch": 1.56,
2750
- "learning_rate": 2.286058658095621e-05,
2751
- "loss": 0.2659,
2752
- "step": 4040
2753
- },
2754
- {
2755
- "epoch": 1.56,
2756
- "learning_rate": 2.2659702691844116e-05,
2757
- "loss": 0.2662,
2758
- "step": 4050
2759
- },
2760
- {
2761
- "epoch": 1.57,
2762
- "learning_rate": 2.245881880273202e-05,
2763
- "loss": 0.24,
2764
- "step": 4060
2765
- },
2766
- {
2767
- "epoch": 1.57,
2768
- "learning_rate": 2.225793491361993e-05,
2769
- "loss": 0.248,
2770
- "step": 4070
2771
- },
2772
- {
2773
- "epoch": 1.58,
2774
- "learning_rate": 2.2057051024507836e-05,
2775
- "loss": 0.2675,
2776
- "step": 4080
2777
- },
2778
- {
2779
- "epoch": 1.58,
2780
- "learning_rate": 2.1856167135395742e-05,
2781
- "loss": 0.263,
2782
- "step": 4090
2783
- },
2784
- {
2785
- "epoch": 1.58,
2786
- "learning_rate": 2.165528324628365e-05,
2787
- "loss": 0.259,
2788
- "step": 4100
2789
- },
2790
- {
2791
- "epoch": 1.58,
2792
- "eval_loss": 0.3070940375328064,
2793
- "eval_runtime": 232.4523,
2794
- "eval_samples_per_second": 2.151,
2795
- "eval_steps_per_second": 0.069,
2796
- "step": 4100
2797
- },
2798
- {
2799
- "epoch": 1.59,
2800
- "learning_rate": 2.1454399357171557e-05,
2801
- "loss": 0.2708,
2802
- "step": 4110
2803
- },
2804
- {
2805
- "epoch": 1.59,
2806
- "learning_rate": 2.1253515468059463e-05,
2807
- "loss": 0.2591,
2808
- "step": 4120
2809
- },
2810
- {
2811
- "epoch": 1.6,
2812
- "learning_rate": 2.105263157894737e-05,
2813
- "loss": 0.2451,
2814
- "step": 4130
2815
- },
2816
- {
2817
- "epoch": 1.6,
2818
- "learning_rate": 2.0851747689835278e-05,
2819
- "loss": 0.2636,
2820
- "step": 4140
2821
- },
2822
- {
2823
- "epoch": 1.6,
2824
- "learning_rate": 2.0650863800723184e-05,
2825
- "loss": 0.2754,
2826
- "step": 4150
2827
- },
2828
- {
2829
- "epoch": 1.61,
2830
- "learning_rate": 2.044997991161109e-05,
2831
- "loss": 0.2639,
2832
- "step": 4160
2833
- },
2834
- {
2835
- "epoch": 1.61,
2836
- "learning_rate": 2.0249096022499e-05,
2837
- "loss": 0.258,
2838
- "step": 4170
2839
- },
2840
- {
2841
- "epoch": 1.61,
2842
- "learning_rate": 2.0048212133386905e-05,
2843
- "loss": 0.2585,
2844
- "step": 4180
2845
- },
2846
- {
2847
- "epoch": 1.62,
2848
- "learning_rate": 1.984732824427481e-05,
2849
- "loss": 0.2735,
2850
- "step": 4190
2851
- },
2852
- {
2853
- "epoch": 1.62,
2854
- "learning_rate": 1.9646444355162716e-05,
2855
- "loss": 0.2529,
2856
- "step": 4200
2857
- },
2858
- {
2859
- "epoch": 1.62,
2860
- "eval_loss": 0.3050014078617096,
2861
- "eval_runtime": 231.7346,
2862
- "eval_samples_per_second": 2.158,
2863
- "eval_steps_per_second": 0.069,
2864
- "step": 4200
2865
- },
2866
- {
2867
- "epoch": 1.63,
2868
- "learning_rate": 1.9445560466050626e-05,
2869
- "loss": 0.2392,
2870
- "step": 4210
2871
- },
2872
- {
2873
- "epoch": 1.63,
2874
- "learning_rate": 1.924467657693853e-05,
2875
- "loss": 0.2495,
2876
- "step": 4220
2877
- },
2878
- {
2879
- "epoch": 1.63,
2880
- "learning_rate": 1.9043792687826437e-05,
2881
- "loss": 0.2435,
2882
- "step": 4230
2883
- },
2884
- {
2885
- "epoch": 1.64,
2886
- "learning_rate": 1.8842908798714343e-05,
2887
- "loss": 0.2352,
2888
- "step": 4240
2889
- },
2890
- {
2891
- "epoch": 1.64,
2892
- "learning_rate": 1.8642024909602252e-05,
2893
- "loss": 0.2506,
2894
- "step": 4250
2895
- },
2896
- {
2897
- "epoch": 1.65,
2898
- "learning_rate": 1.8441141020490158e-05,
2899
- "loss": 0.2384,
2900
- "step": 4260
2901
- },
2902
- {
2903
- "epoch": 1.65,
2904
- "learning_rate": 1.8240257131378064e-05,
2905
- "loss": 0.256,
2906
- "step": 4270
2907
- },
2908
- {
2909
- "epoch": 1.65,
2910
- "learning_rate": 1.8039373242265973e-05,
2911
- "loss": 0.255,
2912
- "step": 4280
2913
- },
2914
- {
2915
- "epoch": 1.66,
2916
- "learning_rate": 1.783848935315388e-05,
2917
- "loss": 0.2645,
2918
- "step": 4290
2919
- },
2920
- {
2921
- "epoch": 1.66,
2922
- "learning_rate": 1.7637605464041785e-05,
2923
- "loss": 0.2624,
2924
- "step": 4300
2925
- },
2926
- {
2927
- "epoch": 1.66,
2928
- "eval_loss": 0.308837890625,
2929
- "eval_runtime": 236.1172,
2930
- "eval_samples_per_second": 2.118,
2931
- "eval_steps_per_second": 0.068,
2932
- "step": 4300
2933
- },
2934
- {
2935
- "epoch": 1.66,
2936
- "learning_rate": 1.743672157492969e-05,
2937
- "loss": 0.2557,
2938
- "step": 4310
2939
- },
2940
- {
2941
- "epoch": 1.67,
2942
- "learning_rate": 1.72358376858176e-05,
2943
- "loss": 0.2391,
2944
- "step": 4320
2945
- },
2946
- {
2947
- "epoch": 1.67,
2948
- "learning_rate": 1.7034953796705506e-05,
2949
- "loss": 0.2361,
2950
- "step": 4330
2951
- },
2952
- {
2953
- "epoch": 1.68,
2954
- "learning_rate": 1.683406990759341e-05,
2955
- "loss": 0.2389,
2956
- "step": 4340
2957
- },
2958
- {
2959
- "epoch": 1.68,
2960
- "learning_rate": 1.663318601848132e-05,
2961
- "loss": 0.2491,
2962
- "step": 4350
2963
- },
2964
- {
2965
- "epoch": 1.68,
2966
- "learning_rate": 1.6432302129369227e-05,
2967
- "loss": 0.2556,
2968
- "step": 4360
2969
- },
2970
- {
2971
- "epoch": 1.69,
2972
- "learning_rate": 1.6231418240257132e-05,
2973
- "loss": 0.2963,
2974
- "step": 4370
2975
- },
2976
- {
2977
- "epoch": 1.69,
2978
- "learning_rate": 1.6030534351145038e-05,
2979
- "loss": 0.2571,
2980
- "step": 4380
2981
- },
2982
- {
2983
- "epoch": 1.7,
2984
- "learning_rate": 1.5829650462032947e-05,
2985
- "loss": 0.2707,
2986
- "step": 4390
2987
- },
2988
- {
2989
- "epoch": 1.7,
2990
- "learning_rate": 1.5628766572920853e-05,
2991
- "loss": 0.2513,
2992
- "step": 4400
2993
- },
2994
- {
2995
- "epoch": 1.7,
2996
- "eval_loss": 0.3080357015132904,
2997
- "eval_runtime": 231.5394,
2998
- "eval_samples_per_second": 2.159,
2999
- "eval_steps_per_second": 0.069,
3000
- "step": 4400
3001
- },
3002
- {
3003
- "epoch": 1.7,
3004
- "learning_rate": 1.542788268380876e-05,
3005
- "loss": 0.2542,
3006
- "step": 4410
3007
- },
3008
- {
3009
- "epoch": 1.71,
3010
- "learning_rate": 1.5226998794696665e-05,
3011
- "loss": 0.2576,
3012
- "step": 4420
3013
- },
3014
- {
3015
- "epoch": 1.71,
3016
- "learning_rate": 1.5026114905584574e-05,
3017
- "loss": 0.2314,
3018
- "step": 4430
3019
- },
3020
- {
3021
- "epoch": 1.71,
3022
- "learning_rate": 1.482523101647248e-05,
3023
- "loss": 0.242,
3024
- "step": 4440
3025
- },
3026
- {
3027
- "epoch": 1.72,
3028
- "learning_rate": 1.4624347127360386e-05,
3029
- "loss": 0.2599,
3030
- "step": 4450
3031
- },
3032
- {
3033
- "epoch": 1.72,
3034
- "learning_rate": 1.4423463238248295e-05,
3035
- "loss": 0.267,
3036
- "step": 4460
3037
- },
3038
- {
3039
- "epoch": 1.73,
3040
- "learning_rate": 1.42225793491362e-05,
3041
- "loss": 0.2508,
3042
- "step": 4470
3043
- },
3044
- {
3045
- "epoch": 1.73,
3046
- "learning_rate": 1.4021695460024107e-05,
3047
- "loss": 0.2845,
3048
- "step": 4480
3049
- },
3050
- {
3051
- "epoch": 1.73,
3052
- "learning_rate": 1.3820811570912012e-05,
3053
- "loss": 0.2647,
3054
- "step": 4490
3055
- },
3056
- {
3057
- "epoch": 1.74,
3058
- "learning_rate": 1.3619927681799922e-05,
3059
- "loss": 0.2667,
3060
- "step": 4500
3061
- },
3062
- {
3063
- "epoch": 1.74,
3064
- "eval_loss": 0.3072335422039032,
3065
- "eval_runtime": 231.7522,
3066
- "eval_samples_per_second": 2.157,
3067
- "eval_steps_per_second": 0.069,
3068
- "step": 4500
3069
- },
3070
- {
3071
- "epoch": 1.74,
3072
- "learning_rate": 1.3419043792687827e-05,
3073
- "loss": 0.2759,
3074
- "step": 4510
3075
- },
3076
- {
3077
- "epoch": 1.75,
3078
- "learning_rate": 1.3218159903575733e-05,
3079
- "loss": 0.2618,
3080
- "step": 4520
3081
- },
3082
- {
3083
- "epoch": 1.75,
3084
- "learning_rate": 1.3017276014463642e-05,
3085
- "loss": 0.2465,
3086
- "step": 4530
3087
- },
3088
- {
3089
- "epoch": 1.75,
3090
- "learning_rate": 1.2816392125351548e-05,
3091
- "loss": 0.2601,
3092
- "step": 4540
3093
- },
3094
- {
3095
- "epoch": 1.76,
3096
- "learning_rate": 1.2615508236239454e-05,
3097
- "loss": 0.2567,
3098
- "step": 4550
3099
- },
3100
- {
3101
- "epoch": 1.76,
3102
- "learning_rate": 1.2414624347127362e-05,
3103
- "loss": 0.2366,
3104
- "step": 4560
3105
- },
3106
- {
3107
- "epoch": 1.77,
3108
- "learning_rate": 1.2213740458015267e-05,
3109
- "loss": 0.2563,
3110
- "step": 4570
3111
- },
3112
- {
3113
- "epoch": 1.77,
3114
- "learning_rate": 1.2012856568903175e-05,
3115
- "loss": 0.2599,
3116
- "step": 4580
3117
- },
3118
- {
3119
- "epoch": 1.77,
3120
- "learning_rate": 1.181197267979108e-05,
3121
- "loss": 0.2677,
3122
- "step": 4590
3123
- },
3124
- {
3125
- "epoch": 1.78,
3126
- "learning_rate": 1.1611088790678988e-05,
3127
- "loss": 0.2559,
3128
- "step": 4600
3129
- },
3130
- {
3131
- "epoch": 1.78,
3132
- "eval_loss": 0.3053501546382904,
3133
- "eval_runtime": 227.8046,
3134
- "eval_samples_per_second": 2.195,
3135
- "eval_steps_per_second": 0.07,
3136
- "step": 4600
3137
- },
3138
- {
3139
- "epoch": 1.78,
3140
- "learning_rate": 1.1410204901566896e-05,
3141
- "loss": 0.2499,
3142
- "step": 4610
3143
- },
3144
- {
3145
- "epoch": 1.78,
3146
- "learning_rate": 1.1209321012454802e-05,
3147
- "loss": 0.27,
3148
- "step": 4620
3149
- },
3150
- {
3151
- "epoch": 1.79,
3152
- "learning_rate": 1.1008437123342709e-05,
3153
- "loss": 0.2661,
3154
- "step": 4630
3155
- },
3156
- {
3157
- "epoch": 1.79,
3158
- "learning_rate": 1.0807553234230615e-05,
3159
- "loss": 0.2456,
3160
- "step": 4640
3161
- },
3162
- {
3163
- "epoch": 1.8,
3164
- "learning_rate": 1.0606669345118522e-05,
3165
- "loss": 0.2711,
3166
- "step": 4650
3167
- },
3168
- {
3169
- "epoch": 1.8,
3170
- "learning_rate": 1.0405785456006428e-05,
3171
- "loss": 0.2593,
3172
- "step": 4660
3173
- },
3174
- {
3175
- "epoch": 1.8,
3176
- "learning_rate": 1.0204901566894336e-05,
3177
- "loss": 0.2536,
3178
- "step": 4670
3179
- },
3180
- {
3181
- "epoch": 1.81,
3182
- "learning_rate": 1.0004017677782242e-05,
3183
- "loss": 0.2514,
3184
- "step": 4680
3185
- },
3186
- {
3187
- "epoch": 1.81,
3188
- "learning_rate": 9.803133788670149e-06,
3189
- "loss": 0.2489,
3190
- "step": 4690
3191
- },
3192
- {
3193
- "epoch": 1.82,
3194
- "learning_rate": 9.602249899558057e-06,
3195
- "loss": 0.2238,
3196
- "step": 4700
3197
- },
3198
- {
3199
- "epoch": 1.82,
3200
- "eval_loss": 0.3055942952632904,
3201
- "eval_runtime": 229.6273,
3202
- "eval_samples_per_second": 2.177,
3203
- "eval_steps_per_second": 0.07,
3204
- "step": 4700
3205
- },
3206
- {
3207
- "epoch": 1.82,
3208
- "learning_rate": 9.401366010445962e-06,
3209
- "loss": 0.2482,
3210
- "step": 4710
3211
- },
3212
- {
3213
- "epoch": 1.82,
3214
- "learning_rate": 9.20048212133387e-06,
3215
- "loss": 0.2443,
3216
- "step": 4720
3217
- },
3218
- {
3219
- "epoch": 1.83,
3220
- "learning_rate": 8.999598232221776e-06,
3221
- "loss": 0.2386,
3222
- "step": 4730
3223
- },
3224
- {
3225
- "epoch": 1.83,
3226
- "learning_rate": 8.798714343109683e-06,
3227
- "loss": 0.2502,
3228
- "step": 4740
3229
- },
3230
- {
3231
- "epoch": 1.83,
3232
- "learning_rate": 8.597830453997589e-06,
3233
- "loss": 0.2552,
3234
- "step": 4750
3235
- },
3236
- {
3237
- "epoch": 1.84,
3238
- "learning_rate": 8.396946564885497e-06,
3239
- "loss": 0.2473,
3240
- "step": 4760
3241
- },
3242
- {
3243
- "epoch": 1.84,
3244
- "learning_rate": 8.196062675773402e-06,
3245
- "loss": 0.249,
3246
- "step": 4770
3247
- },
3248
- {
3249
- "epoch": 1.85,
3250
- "learning_rate": 7.99517878666131e-06,
3251
- "loss": 0.2373,
3252
- "step": 4780
3253
- },
3254
- {
3255
- "epoch": 1.85,
3256
- "learning_rate": 7.794294897549218e-06,
3257
- "loss": 0.2211,
3258
- "step": 4790
3259
- },
3260
- {
3261
- "epoch": 1.85,
3262
- "learning_rate": 7.593411008437123e-06,
3263
- "loss": 0.2686,
3264
- "step": 4800
3265
- },
3266
- {
3267
- "epoch": 1.85,
3268
- "eval_loss": 0.3053850531578064,
3269
- "eval_runtime": 231.0424,
3270
- "eval_samples_per_second": 2.164,
3271
- "eval_steps_per_second": 0.069,
3272
- "step": 4800
3273
- },
3274
- {
3275
- "epoch": 1.86,
3276
- "learning_rate": 7.392527119325031e-06,
3277
- "loss": 0.2641,
3278
- "step": 4810
3279
- },
3280
- {
3281
- "epoch": 1.86,
3282
- "learning_rate": 7.191643230212937e-06,
3283
- "loss": 0.2389,
3284
- "step": 4820
3285
- },
3286
- {
3287
- "epoch": 1.87,
3288
- "learning_rate": 6.990759341100844e-06,
3289
- "loss": 0.2573,
3290
- "step": 4830
3291
- },
3292
- {
3293
- "epoch": 1.87,
3294
- "learning_rate": 6.78987545198875e-06,
3295
- "loss": 0.2335,
3296
- "step": 4840
3297
- },
3298
- {
3299
- "epoch": 1.87,
3300
- "learning_rate": 6.5889915628766575e-06,
3301
- "loss": 0.2724,
3302
- "step": 4850
3303
- },
3304
- {
3305
- "epoch": 1.88,
3306
- "learning_rate": 6.388107673764563e-06,
3307
- "loss": 0.2552,
3308
- "step": 4860
3309
- },
3310
- {
3311
- "epoch": 1.88,
3312
- "learning_rate": 6.187223784652471e-06,
3313
- "loss": 0.2422,
3314
- "step": 4870
3315
- },
3316
- {
3317
- "epoch": 1.88,
3318
- "learning_rate": 5.9863398955403775e-06,
3319
- "loss": 0.252,
3320
- "step": 4880
3321
- },
3322
- {
3323
- "epoch": 1.89,
3324
- "learning_rate": 5.785456006428284e-06,
3325
- "loss": 0.2619,
3326
- "step": 4890
3327
- },
3328
- {
3329
- "epoch": 1.89,
3330
- "learning_rate": 5.584572117316191e-06,
3331
- "loss": 0.2684,
3332
- "step": 4900
3333
- },
3334
- {
3335
- "epoch": 1.89,
3336
- "eval_loss": 0.3049665093421936,
3337
- "eval_runtime": 231.0605,
3338
- "eval_samples_per_second": 2.164,
3339
- "eval_steps_per_second": 0.069,
3340
- "step": 4900
3341
- },
3342
- {
3343
- "epoch": 1.9,
3344
- "learning_rate": 5.383688228204098e-06,
3345
- "loss": 0.2503,
3346
- "step": 4910
3347
- },
3348
- {
3349
- "epoch": 1.9,
3350
- "learning_rate": 5.182804339092005e-06,
3351
- "loss": 0.2581,
3352
- "step": 4920
3353
- },
3354
- {
3355
- "epoch": 1.9,
3356
- "learning_rate": 4.981920449979912e-06,
3357
- "loss": 0.2445,
3358
- "step": 4930
3359
- },
3360
- {
3361
- "epoch": 1.91,
3362
- "learning_rate": 4.781036560867818e-06,
3363
- "loss": 0.2527,
3364
- "step": 4940
3365
- },
3366
- {
3367
- "epoch": 1.91,
3368
- "learning_rate": 4.580152671755725e-06,
3369
- "loss": 0.2579,
3370
- "step": 4950
3371
- },
3372
- {
3373
- "epoch": 1.92,
3374
- "learning_rate": 4.379268782643632e-06,
3375
- "loss": 0.2558,
3376
- "step": 4960
3377
- },
3378
- {
3379
- "epoch": 1.92,
3380
- "learning_rate": 4.178384893531538e-06,
3381
- "loss": 0.2598,
3382
- "step": 4970
3383
- },
3384
- {
3385
- "epoch": 1.92,
3386
- "learning_rate": 3.977501004419446e-06,
3387
- "loss": 0.231,
3388
- "step": 4980
3389
- },
3390
- {
3391
- "epoch": 1.93,
3392
- "learning_rate": 3.776617115307352e-06,
3393
- "loss": 0.2665,
3394
- "step": 4990
3395
- },
3396
- {
3397
- "epoch": 1.93,
3398
- "learning_rate": 3.5757332261952597e-06,
3399
- "loss": 0.247,
3400
- "step": 5000
3401
- },
3402
- {
3403
- "epoch": 1.93,
3404
- "eval_loss": 0.3040597140789032,
3405
- "eval_runtime": 233.9026,
3406
- "eval_samples_per_second": 2.138,
3407
- "eval_steps_per_second": 0.068,
3408
- "step": 5000
3409
- },
3410
- {
3411
- "epoch": 1.94,
3412
- "learning_rate": 3.3748493370831664e-06,
3413
- "loss": 0.2499,
3414
- "step": 5010
3415
- },
3416
- {
3417
- "epoch": 1.94,
3418
- "learning_rate": 3.173965447971073e-06,
3419
- "loss": 0.241,
3420
- "step": 5020
3421
- },
3422
- {
3423
- "epoch": 1.94,
3424
- "learning_rate": 2.9730815588589797e-06,
3425
- "loss": 0.2455,
3426
- "step": 5030
3427
- },
3428
- {
3429
- "epoch": 1.95,
3430
- "learning_rate": 2.7721976697468864e-06,
3431
- "loss": 0.2592,
3432
- "step": 5040
3433
- },
3434
- {
3435
- "epoch": 1.95,
3436
- "learning_rate": 2.571313780634793e-06,
3437
- "loss": 0.2829,
3438
- "step": 5050
3439
- },
3440
- {
3441
- "epoch": 1.95,
3442
- "learning_rate": 2.3704298915227e-06,
3443
- "loss": 0.251,
3444
- "step": 5060
3445
- },
3446
- {
3447
- "epoch": 1.96,
3448
- "learning_rate": 2.169546002410607e-06,
3449
- "loss": 0.2587,
3450
- "step": 5070
3451
- },
3452
- {
3453
- "epoch": 1.96,
3454
- "learning_rate": 1.9686621132985135e-06,
3455
- "loss": 0.2574,
3456
- "step": 5080
3457
- },
3458
- {
3459
- "epoch": 1.97,
3460
- "learning_rate": 1.7677782241864203e-06,
3461
- "loss": 0.2697,
3462
- "step": 5090
3463
- },
3464
- {
3465
- "epoch": 1.97,
3466
- "learning_rate": 1.566894335074327e-06,
3467
- "loss": 0.2477,
3468
- "step": 5100
3469
- },
3470
- {
3471
- "epoch": 1.97,
3472
- "eval_loss": 0.303466796875,
3473
- "eval_runtime": 231.3482,
3474
- "eval_samples_per_second": 2.161,
3475
- "eval_steps_per_second": 0.069,
3476
- "step": 5100
3477
- },
3478
- {
3479
- "epoch": 1.97,
3480
- "learning_rate": 1.3660104459622339e-06,
3481
- "loss": 0.2642,
3482
- "step": 5110
3483
- },
3484
- {
3485
- "epoch": 1.98,
3486
- "learning_rate": 1.1651265568501408e-06,
3487
- "loss": 0.2616,
3488
- "step": 5120
3489
- },
3490
- {
3491
- "epoch": 1.98,
3492
- "learning_rate": 9.642426677380474e-07,
3493
- "loss": 0.2268,
3494
- "step": 5130
3495
- },
3496
- {
3497
- "epoch": 1.99,
3498
- "learning_rate": 7.633587786259542e-07,
3499
- "loss": 0.2561,
3500
- "step": 5140
3501
- },
3502
- {
3503
- "epoch": 1.99,
3504
- "learning_rate": 5.62474889513861e-07,
3505
- "loss": 0.2544,
3506
- "step": 5150
3507
- },
3508
- {
3509
- "epoch": 1.99,
3510
- "learning_rate": 3.6159100040176776e-07,
3511
- "loss": 0.2465,
3512
- "step": 5160
3513
- },
3514
- {
3515
- "epoch": 2.0,
3516
- "learning_rate": 1.6070711128967456e-07,
3517
- "loss": 0.2721,
3518
- "step": 5170
3519
- }
3520
- ],
3521
- "max_steps": 5178,
3522
- "num_train_epochs": 2,
3523
- "total_flos": 3.628920535154426e+18,
3524
- "trial_name": null,
3525
- "trial_params": null
3526
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
owl-con/checkpoint-5178/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:966f65be3c927ae7b3f355c69d036bd4d1faf28b059ed8e0b3f7caf80b643f7f
3
- size 3707
 
 
 
 
owl-con/params.txt DELETED
@@ -1,36 +0,0 @@
1
- adam_beta1: 0.9
2
- adam_beta2: 0.999
3
- adam_eps: 1e-08
4
- all_params: True
5
- bf16: True
6
- clip_grad: 1.0
7
- ddp_find_unused_parameters: False
8
- do_train: True
9
- eval_iters: 100
10
- finetuned_ckpt: None
11
- gradient_accumulation_steps: 1
12
- gradient_checkpointing: True
13
- inference_mode: False
14
- local_rank: 0
15
- logging_nan_inf_filter: False
16
- lora_alpha: 32
17
- lora_dropout: 0.05
18
- lora_r: 32
19
- loss_objective: sequential
20
- lr: 0.0001
21
- micro_batch_size: 16
22
- min_lr: 1e-07
23
- mm_config: configs/video_mix.yaml
24
- num_training_steps: 4236
25
- num_warmup_steps: 200
26
- num_workers: 32
27
- pretrained_ckpt: /local2/hbansal/video_text/mplug-owl-llama-7b-video
28
- save_interval: None
29
- save_path: /local2/hbansal/video_text/mplugowl_wipeout_data_second_stage_lora_all_params_32_sequential_mix_1e-4/
30
- seq_length: 256
31
- train_epochs: 2
32
- train_visual_abstractor: False
33
- use_lora: True
34
- use_qv: False
35
- wandb_run_name: mplugowl_wipeout_data_second_stage_lora_all_params_32_sequential_mix_1e-4
36
- weight_decay: 0.0001