hynky HF staff commited on
Commit
c8eaa54
1 Parent(s): 3142f6c

Model save

Browse files
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  license: llama2
3
- base_model: codellama/Codellama-7b-Instruct-hf
4
  tags:
5
  - generated_from_trainer
6
  model-index:
@@ -13,9 +13,9 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # codellama-7b-sft-lora-func-names
15
 
16
- This model is a fine-tuned version of [codellama/Codellama-7b-Instruct-hf](https://huggingface.co/codellama/Codellama-7b-Instruct-hf) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.8012
19
 
20
  ## Model description
21
 
@@ -44,17 +44,17 @@ The following hyperparameters were used during training:
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
  - lr_scheduler_type: cosine
46
  - lr_scheduler_warmup_ratio: 0.1
47
- - training_steps: 960
48
 
49
  ### Training results
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:----:|:---------------:|
53
- | 0.7046 | 0.31 | 192 | 0.7329 |
54
- | 0.6521 | 0.61 | 384 | 0.7474 |
55
- | 0.5824 | 0.92 | 576 | 0.7729 |
56
- | 0.5575 | 1.23 | 768 | 0.7964 |
57
- | 0.5371 | 1.54 | 960 | 0.8012 |
58
 
59
 
60
  ### Framework versions
 
1
  ---
2
  license: llama2
3
+ base_model: codellama/CodeLlama-7b-Instruct-hf
4
  tags:
5
  - generated_from_trainer
6
  model-index:
 
13
 
14
  # codellama-7b-sft-lora-func-names
15
 
16
+ This model is a fine-tuned version of [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.7084
19
 
20
  ## Model description
21
 
 
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
  - lr_scheduler_type: cosine
46
  - lr_scheduler_warmup_ratio: 0.1
47
+ - training_steps: 900
48
 
49
  ### Training results
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:----:|:---------------:|
53
+ | 0.7541 | 0.01 | 180 | 0.7222 |
54
+ | 0.7126 | 0.01 | 360 | 0.7118 |
55
+ | 0.7342 | 0.02 | 540 | 0.7100 |
56
+ | 0.7216 | 0.03 | 720 | 0.7083 |
57
+ | 0.7171 | 0.04 | 900 | 0.7084 |
58
 
59
 
60
  ### Framework versions
adapter_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
- "base_model_name_or_path": "codellama/Codellama-7b-Instruct-hf",
5
  "bias": "none",
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
@@ -16,10 +16,10 @@
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
 
19
  "q_proj",
20
- "k_proj",
21
  "o_proj",
22
- "v_proj"
23
  ],
24
  "task_type": "CAUSAL_LM"
25
  }
 
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
5
  "bias": "none",
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
 
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
+ "v_proj",
20
  "q_proj",
 
21
  "o_proj",
22
+ "k_proj"
23
  ],
24
  "task_type": "CAUSAL_LM"
25
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51eee9d9700e6d36b06c92a7a0e9de7328f8f1b637b34c01423c901f28571a2d
3
  size 536906096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dd35e68b814416647cebfb59b1a3eb02805721bcd06f0fd64b38a515eeb3965
3
  size 536906096
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 1.54,
3
- "eval_loss": 0.8011999726295471,
4
- "eval_runtime": 117.8304,
5
  "eval_samples": 2000,
6
- "eval_samples_per_second": 16.974,
7
- "eval_steps_per_second": 4.243,
8
- "train_loss": 0.6617152964075407,
9
- "train_runtime": 23627.0915,
10
- "train_samples": 10000,
11
- "train_samples_per_second": 0.65,
12
  "train_steps_per_second": 0.041
13
  }
 
1
  {
2
+ "epoch": 0.04,
3
+ "eval_loss": 0.70841383934021,
4
+ "eval_runtime": 117.6726,
5
  "eval_samples": 2000,
6
+ "eval_samples_per_second": 16.996,
7
+ "eval_steps_per_second": 4.249,
8
+ "train_loss": 0.7602509791321225,
9
+ "train_runtime": 22175.0023,
10
+ "train_samples": 405813,
11
+ "train_samples_per_second": 0.649,
12
  "train_steps_per_second": 0.041
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.54,
3
- "eval_loss": 0.8011999726295471,
4
- "eval_runtime": 117.8304,
5
  "eval_samples": 2000,
6
- "eval_samples_per_second": 16.974,
7
- "eval_steps_per_second": 4.243
8
  }
 
1
  {
2
+ "epoch": 0.04,
3
+ "eval_loss": 0.70841383934021,
4
+ "eval_runtime": 117.6726,
5
  "eval_samples": 2000,
6
+ "eval_samples_per_second": 16.996,
7
+ "eval_steps_per_second": 4.249
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.54,
3
- "train_loss": 0.6617152964075407,
4
- "train_runtime": 23627.0915,
5
- "train_samples": 10000,
6
- "train_samples_per_second": 0.65,
7
  "train_steps_per_second": 0.041
8
  }
 
1
  {
2
+ "epoch": 0.04,
3
+ "train_loss": 0.7602509791321225,
4
+ "train_runtime": 22175.0023,
5
+ "train_samples": 405813,
6
+ "train_samples_per_second": 0.649,
7
  "train_steps_per_second": 0.041
8
  }
trainer_state.json CHANGED
@@ -1,1226 +1,1154 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.536,
5
- "eval_steps": 192,
6
- "global_step": 960,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 4.1666666666666667e-07,
14
- "loss": 1.5394,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.01,
19
- "learning_rate": 2.0833333333333334e-06,
20
- "loss": 1.5312,
21
  "step": 5
22
  },
23
  {
24
- "epoch": 0.02,
25
- "learning_rate": 4.166666666666667e-06,
26
- "loss": 1.4755,
27
  "step": 10
28
  },
29
  {
30
- "epoch": 0.02,
31
- "learning_rate": 6.25e-06,
32
- "loss": 1.4213,
33
  "step": 15
34
  },
35
  {
36
- "epoch": 0.03,
37
- "learning_rate": 8.333333333333334e-06,
38
- "loss": 1.2847,
39
  "step": 20
40
  },
41
  {
42
- "epoch": 0.04,
43
- "learning_rate": 1.0416666666666668e-05,
44
- "loss": 1.1147,
45
  "step": 25
46
  },
47
  {
48
- "epoch": 0.05,
49
- "learning_rate": 1.25e-05,
50
- "loss": 1.0576,
51
  "step": 30
52
  },
53
  {
54
- "epoch": 0.06,
55
- "learning_rate": 1.4583333333333333e-05,
56
- "loss": 1.0231,
57
  "step": 35
58
  },
59
  {
60
- "epoch": 0.06,
61
- "learning_rate": 1.6666666666666667e-05,
62
- "loss": 0.9849,
63
  "step": 40
64
  },
65
  {
66
- "epoch": 0.07,
67
- "learning_rate": 1.8750000000000002e-05,
68
- "loss": 0.9878,
69
  "step": 45
70
  },
71
  {
72
- "epoch": 0.08,
73
- "learning_rate": 2.0833333333333336e-05,
74
- "loss": 0.9602,
75
  "step": 50
76
  },
77
  {
78
- "epoch": 0.09,
79
- "learning_rate": 2.2916666666666667e-05,
80
- "loss": 0.9681,
81
  "step": 55
82
  },
83
  {
84
- "epoch": 0.1,
85
- "learning_rate": 2.5e-05,
86
- "loss": 0.9352,
87
  "step": 60
88
  },
89
  {
90
- "epoch": 0.1,
91
- "learning_rate": 2.7083333333333335e-05,
92
- "loss": 0.9187,
93
  "step": 65
94
  },
95
  {
96
- "epoch": 0.11,
97
- "learning_rate": 2.9166666666666666e-05,
98
- "loss": 0.9051,
99
  "step": 70
100
  },
101
  {
102
- "epoch": 0.12,
103
- "learning_rate": 3.125e-05,
104
- "loss": 0.8898,
105
  "step": 75
106
  },
107
  {
108
- "epoch": 0.13,
109
- "learning_rate": 3.3333333333333335e-05,
110
- "loss": 0.8779,
111
  "step": 80
112
  },
113
  {
114
- "epoch": 0.14,
115
- "learning_rate": 3.541666666666667e-05,
116
- "loss": 0.8798,
117
  "step": 85
118
  },
119
  {
120
- "epoch": 0.14,
121
- "learning_rate": 3.7500000000000003e-05,
122
- "loss": 0.8861,
123
  "step": 90
124
  },
125
  {
126
- "epoch": 0.15,
127
- "learning_rate": 3.958333333333334e-05,
128
- "loss": 0.8748,
129
  "step": 95
130
  },
131
  {
132
- "epoch": 0.16,
133
- "learning_rate": 3.999788463854215e-05,
134
- "loss": 0.8486,
135
  "step": 100
136
  },
137
  {
138
- "epoch": 0.17,
139
- "learning_rate": 3.9989291749527314e-05,
140
- "loss": 0.8652,
141
  "step": 105
142
  },
143
  {
144
- "epoch": 0.18,
145
- "learning_rate": 3.997409196081781e-05,
146
- "loss": 0.8443,
147
  "step": 110
148
  },
149
  {
150
- "epoch": 0.18,
151
- "learning_rate": 3.9952290296277454e-05,
152
- "loss": 0.8305,
153
  "step": 115
154
  },
155
  {
156
- "epoch": 0.19,
157
- "learning_rate": 3.9923893961834914e-05,
158
- "loss": 0.8217,
159
  "step": 120
160
  },
161
  {
162
- "epoch": 0.2,
163
- "learning_rate": 3.988891234310205e-05,
164
- "loss": 0.8284,
165
  "step": 125
166
  },
167
  {
168
- "epoch": 0.21,
169
- "learning_rate": 3.98473570022717e-05,
170
- "loss": 0.7975,
171
  "step": 130
172
  },
173
  {
174
- "epoch": 0.22,
175
- "learning_rate": 3.979924167429616e-05,
176
- "loss": 0.769,
177
  "step": 135
178
  },
179
  {
180
- "epoch": 0.22,
181
- "learning_rate": 3.9744582262347486e-05,
182
- "loss": 0.7685,
183
  "step": 140
184
  },
185
  {
186
- "epoch": 0.23,
187
- "learning_rate": 3.968339683256111e-05,
188
- "loss": 0.7308,
189
  "step": 145
190
  },
191
  {
192
- "epoch": 0.24,
193
- "learning_rate": 3.961570560806461e-05,
194
- "loss": 0.74,
195
  "step": 150
196
  },
197
  {
198
- "epoch": 0.25,
199
- "learning_rate": 3.954153096229354e-05,
200
- "loss": 0.7414,
201
  "step": 155
202
  },
203
  {
204
- "epoch": 0.26,
205
- "learning_rate": 3.946089741159648e-05,
206
- "loss": 0.7143,
207
  "step": 160
208
  },
209
  {
210
- "epoch": 0.26,
211
- "learning_rate": 3.937383160713187e-05,
212
- "loss": 0.7298,
213
  "step": 165
214
  },
215
  {
216
- "epoch": 0.27,
217
- "learning_rate": 3.9280362326059194e-05,
218
- "loss": 0.7259,
219
  "step": 170
220
  },
221
  {
222
- "epoch": 0.28,
223
- "learning_rate": 3.918052046202755e-05,
224
- "loss": 0.7032,
225
  "step": 175
226
  },
227
  {
228
- "epoch": 0.29,
229
- "learning_rate": 3.907433901496454e-05,
230
- "loss": 0.7353,
231
  "step": 180
232
  },
233
  {
234
- "epoch": 0.3,
235
- "learning_rate": 3.8961853080169156e-05,
236
- "loss": 0.7075,
237
- "step": 185
 
 
238
  },
239
  {
240
- "epoch": 0.3,
241
- "learning_rate": 3.884309983671193e-05,
242
- "loss": 0.7046,
243
- "step": 190
244
  },
245
  {
246
- "epoch": 0.31,
247
- "eval_loss": 0.7329480648040771,
248
- "eval_runtime": 118.1096,
249
- "eval_samples_per_second": 16.933,
250
- "eval_steps_per_second": 4.233,
251
- "step": 192
252
  },
253
  {
254
- "epoch": 0.31,
255
- "learning_rate": 3.871811853514652e-05,
256
- "loss": 0.7085,
257
  "step": 195
258
  },
259
  {
260
- "epoch": 0.32,
261
- "learning_rate": 3.858695048453645e-05,
262
- "loss": 0.7113,
263
  "step": 200
264
  },
265
  {
266
- "epoch": 0.33,
267
- "learning_rate": 3.844963903880165e-05,
268
- "loss": 0.7117,
269
  "step": 205
270
  },
271
  {
272
- "epoch": 0.34,
273
- "learning_rate": 3.830622958238895e-05,
274
- "loss": 0.7031,
275
  "step": 210
276
  },
277
  {
278
- "epoch": 0.34,
279
- "learning_rate": 3.815676951527158e-05,
280
- "loss": 0.7057,
281
  "step": 215
282
  },
283
  {
284
- "epoch": 0.35,
285
- "learning_rate": 3.800130823728242e-05,
286
- "loss": 0.7072,
287
  "step": 220
288
  },
289
  {
290
- "epoch": 0.36,
291
- "learning_rate": 3.783989713178629e-05,
292
- "loss": 0.7006,
293
  "step": 225
294
  },
295
  {
296
- "epoch": 0.37,
297
- "learning_rate": 3.767258954869656e-05,
298
- "loss": 0.6969,
299
  "step": 230
300
  },
301
  {
302
- "epoch": 0.38,
303
- "learning_rate": 3.7499440786841897e-05,
304
- "loss": 0.6878,
305
  "step": 235
306
  },
307
  {
308
- "epoch": 0.38,
309
- "learning_rate": 3.732050807568878e-05,
310
- "loss": 0.6901,
311
  "step": 240
312
  },
313
  {
314
- "epoch": 0.39,
315
- "learning_rate": 3.713585055642586e-05,
316
- "loss": 0.6812,
317
  "step": 245
318
  },
319
  {
320
- "epoch": 0.4,
321
- "learning_rate": 3.694552926241656e-05,
322
- "loss": 0.6854,
323
  "step": 250
324
  },
325
  {
326
- "epoch": 0.41,
327
- "learning_rate": 3.674960709902616e-05,
328
- "loss": 0.6871,
329
  "step": 255
330
  },
331
  {
332
- "epoch": 0.42,
333
- "learning_rate": 3.654814882283021e-05,
334
- "loss": 0.6824,
335
  "step": 260
336
  },
337
  {
338
- "epoch": 0.42,
339
- "learning_rate": 3.634122102021108e-05,
340
- "loss": 0.6909,
341
  "step": 265
342
  },
343
  {
344
- "epoch": 0.43,
345
- "learning_rate": 3.612889208534966e-05,
346
- "loss": 0.6871,
347
  "step": 270
348
  },
349
  {
350
- "epoch": 0.44,
351
- "learning_rate": 3.59112321976196e-05,
352
- "loss": 0.6748,
353
  "step": 275
354
  },
355
  {
356
- "epoch": 0.45,
357
- "learning_rate": 3.568831329839152e-05,
358
- "loss": 0.688,
359
  "step": 280
360
  },
361
  {
362
- "epoch": 0.46,
363
- "learning_rate": 3.546020906725474e-05,
364
- "loss": 0.6806,
365
  "step": 285
366
  },
367
  {
368
- "epoch": 0.46,
369
- "learning_rate": 3.522699489766462e-05,
370
- "loss": 0.662,
371
  "step": 290
372
  },
373
  {
374
- "epoch": 0.47,
375
- "learning_rate": 3.498874787202335e-05,
376
- "loss": 0.6766,
377
  "step": 295
378
  },
379
  {
380
- "epoch": 0.48,
381
- "learning_rate": 3.474554673620248e-05,
382
- "loss": 0.6815,
383
  "step": 300
384
  },
385
  {
386
- "epoch": 0.49,
387
- "learning_rate": 3.4497471873515765e-05,
388
- "loss": 0.6581,
389
  "step": 305
390
  },
391
  {
392
- "epoch": 0.5,
393
- "learning_rate": 3.4244605278150625e-05,
394
- "loss": 0.6509,
395
  "step": 310
396
  },
397
  {
398
- "epoch": 0.5,
399
- "learning_rate": 3.398703052806734e-05,
400
- "loss": 0.6658,
401
  "step": 315
402
  },
403
  {
404
- "epoch": 0.51,
405
- "learning_rate": 3.372483275737468e-05,
406
- "loss": 0.6653,
407
  "step": 320
408
  },
409
  {
410
- "epoch": 0.52,
411
- "learning_rate": 3.3458098628191155e-05,
412
- "loss": 0.6331,
413
  "step": 325
414
  },
415
  {
416
- "epoch": 0.53,
417
- "learning_rate": 3.318691630200138e-05,
418
- "loss": 0.669,
419
  "step": 330
420
  },
421
  {
422
- "epoch": 0.54,
423
- "learning_rate": 3.2911375410516696e-05,
424
- "loss": 0.6525,
425
  "step": 335
426
  },
427
  {
428
- "epoch": 0.54,
429
- "learning_rate": 3.2631567026049954e-05,
430
- "loss": 0.6726,
431
  "step": 340
432
  },
433
  {
434
- "epoch": 0.55,
435
- "learning_rate": 3.2347583631414106e-05,
436
- "loss": 0.6596,
437
  "step": 345
438
  },
439
  {
440
- "epoch": 0.56,
441
- "learning_rate": 3.2059519089354595e-05,
442
- "loss": 0.6587,
443
  "step": 350
444
  },
445
  {
446
- "epoch": 0.57,
447
- "learning_rate": 3.176746861152569e-05,
448
- "loss": 0.6504,
449
  "step": 355
450
  },
451
  {
452
- "epoch": 0.58,
453
- "learning_rate": 3.147152872702092e-05,
454
- "loss": 0.6465,
 
 
 
 
 
 
 
 
455
  "step": 360
456
  },
457
  {
458
- "epoch": 0.58,
459
- "learning_rate": 3.1171797250468094e-05,
460
- "loss": 0.6547,
461
  "step": 365
462
  },
463
  {
464
- "epoch": 0.59,
465
- "learning_rate": 3.08683732496994e-05,
466
- "loss": 0.6353,
467
  "step": 370
468
  },
469
  {
470
- "epoch": 0.6,
471
- "learning_rate": 3.056135701300736e-05,
472
- "loss": 0.6494,
473
  "step": 375
474
  },
475
  {
476
- "epoch": 0.61,
477
- "learning_rate": 3.0250850015997307e-05,
478
- "loss": 0.6521,
479
  "step": 380
480
  },
481
  {
482
- "epoch": 0.61,
483
- "eval_loss": 0.7474338412284851,
484
- "eval_runtime": 117.8174,
485
- "eval_samples_per_second": 16.975,
486
- "eval_steps_per_second": 4.244,
487
- "step": 384
488
- },
489
- {
490
- "epoch": 0.62,
491
- "learning_rate": 2.9936954888047478e-05,
492
- "loss": 0.6363,
493
  "step": 385
494
  },
495
  {
496
- "epoch": 0.62,
497
- "learning_rate": 2.9619775378387756e-05,
498
- "loss": 0.6388,
499
  "step": 390
500
  },
501
  {
502
- "epoch": 0.63,
503
- "learning_rate": 2.9299416321808284e-05,
504
- "loss": 0.6429,
505
  "step": 395
506
  },
507
  {
508
- "epoch": 0.64,
509
- "learning_rate": 2.897598360400925e-05,
510
- "loss": 0.6182,
511
  "step": 400
512
  },
513
  {
514
- "epoch": 0.65,
515
- "learning_rate": 2.8649584126603325e-05,
516
- "loss": 0.6279,
517
  "step": 405
518
  },
519
  {
520
- "epoch": 0.66,
521
- "learning_rate": 2.8320325771782387e-05,
522
- "loss": 0.634,
523
  "step": 410
524
  },
525
  {
526
- "epoch": 0.66,
527
- "learning_rate": 2.798831736666001e-05,
528
- "loss": 0.6278,
529
  "step": 415
530
  },
531
  {
532
- "epoch": 0.67,
533
- "learning_rate": 2.7653668647301797e-05,
534
- "loss": 0.6298,
535
  "step": 420
536
  },
537
  {
538
- "epoch": 0.68,
539
- "learning_rate": 2.7316490222455143e-05,
540
- "loss": 0.619,
541
  "step": 425
542
  },
543
  {
544
- "epoch": 0.69,
545
- "learning_rate": 2.6976893536990618e-05,
546
- "loss": 0.6308,
547
  "step": 430
548
  },
549
  {
550
- "epoch": 0.7,
551
- "learning_rate": 2.6634990835067046e-05,
552
- "loss": 0.6138,
553
  "step": 435
554
  },
555
  {
556
- "epoch": 0.7,
557
- "learning_rate": 2.6290895123032277e-05,
558
- "loss": 0.6394,
559
  "step": 440
560
  },
561
  {
562
- "epoch": 0.71,
563
- "learning_rate": 2.5944720132072156e-05,
564
- "loss": 0.6215,
565
  "step": 445
566
  },
567
  {
568
- "epoch": 0.72,
569
- "learning_rate": 2.5596580280619847e-05,
570
- "loss": 0.628,
571
  "step": 450
572
  },
573
  {
574
- "epoch": 0.73,
575
- "learning_rate": 2.5246590636538035e-05,
576
- "loss": 0.6311,
577
  "step": 455
578
  },
579
  {
580
- "epoch": 0.74,
581
- "learning_rate": 2.4894866879086478e-05,
582
- "loss": 0.6283,
583
  "step": 460
584
  },
585
  {
586
- "epoch": 0.74,
587
- "learning_rate": 2.4541525260687468e-05,
588
- "loss": 0.6134,
589
  "step": 465
590
  },
591
  {
592
- "epoch": 0.75,
593
- "learning_rate": 2.4186682568501844e-05,
594
- "loss": 0.6114,
595
  "step": 470
596
  },
597
  {
598
- "epoch": 0.76,
599
- "learning_rate": 2.3830456085828288e-05,
600
- "loss": 0.5984,
601
  "step": 475
602
  },
603
  {
604
- "epoch": 0.77,
605
- "learning_rate": 2.3472963553338614e-05,
606
- "loss": 0.6236,
607
  "step": 480
608
  },
609
  {
610
- "epoch": 0.78,
611
- "learning_rate": 2.311432313016188e-05,
612
- "loss": 0.618,
613
  "step": 485
614
  },
615
  {
616
- "epoch": 0.78,
617
- "learning_rate": 2.2754653354830215e-05,
618
- "loss": 0.6117,
619
  "step": 490
620
  },
621
  {
622
- "epoch": 0.79,
623
- "learning_rate": 2.239407310609925e-05,
624
- "loss": 0.6095,
625
  "step": 495
626
  },
627
  {
628
- "epoch": 0.8,
629
- "learning_rate": 2.203270156365604e-05,
630
- "loss": 0.6131,
631
  "step": 500
632
  },
633
  {
634
- "epoch": 0.81,
635
- "learning_rate": 2.1670658168727575e-05,
636
- "loss": 0.5883,
637
  "step": 505
638
  },
639
  {
640
- "epoch": 0.82,
641
- "learning_rate": 2.1308062584602865e-05,
642
- "loss": 0.5988,
643
  "step": 510
644
  },
645
  {
646
- "epoch": 0.82,
647
- "learning_rate": 2.094503465708154e-05,
648
- "loss": 0.5863,
649
  "step": 515
650
  },
651
  {
652
- "epoch": 0.83,
653
- "learning_rate": 2.058169437486223e-05,
654
- "loss": 0.6016,
655
  "step": 520
656
  },
657
  {
658
- "epoch": 0.84,
659
- "learning_rate": 2.021816182988365e-05,
660
- "loss": 0.6133,
661
  "step": 525
662
  },
663
  {
664
- "epoch": 0.85,
665
- "learning_rate": 1.985455717763157e-05,
666
- "loss": 0.5928,
667
  "step": 530
668
  },
669
  {
670
- "epoch": 0.86,
671
- "learning_rate": 1.94910005974248e-05,
672
- "loss": 0.6039,
673
  "step": 535
674
  },
675
  {
676
- "epoch": 0.86,
677
- "learning_rate": 1.9127612252693285e-05,
678
- "loss": 0.5839,
 
 
 
 
 
 
 
 
679
  "step": 540
680
  },
681
  {
682
- "epoch": 0.87,
683
- "learning_rate": 1.8764512251261444e-05,
684
- "loss": 0.5895,
685
  "step": 545
686
  },
687
  {
688
- "epoch": 0.88,
689
- "learning_rate": 1.8401820605649928e-05,
690
- "loss": 0.593,
691
  "step": 550
692
  },
693
  {
694
- "epoch": 0.89,
695
- "learning_rate": 1.8039657193408788e-05,
696
- "loss": 0.5969,
697
  "step": 555
698
  },
699
  {
700
- "epoch": 0.9,
701
- "learning_rate": 1.7678141717495394e-05,
702
- "loss": 0.6023,
703
  "step": 560
704
  },
705
  {
706
- "epoch": 0.9,
707
- "learning_rate": 1.7317393666709918e-05,
708
- "loss": 0.5852,
709
  "step": 565
710
  },
711
  {
712
- "epoch": 0.91,
713
- "learning_rate": 1.6957532276201668e-05,
714
- "loss": 0.5868,
715
  "step": 570
716
  },
717
  {
718
- "epoch": 0.92,
719
- "learning_rate": 1.6598676488059292e-05,
720
- "loss": 0.5824,
721
  "step": 575
722
  },
723
  {
724
- "epoch": 0.92,
725
- "eval_loss": 0.7729184031486511,
726
- "eval_runtime": 117.9572,
727
- "eval_samples_per_second": 16.955,
728
- "eval_steps_per_second": 4.239,
729
- "step": 576
730
- },
731
- {
732
- "epoch": 0.93,
733
- "learning_rate": 1.6240944911997765e-05,
734
- "loss": 0.5815,
735
  "step": 580
736
  },
737
  {
738
- "epoch": 0.94,
739
- "learning_rate": 1.5884455786155304e-05,
740
- "loss": 0.5935,
741
  "step": 585
742
  },
743
  {
744
- "epoch": 0.94,
745
- "learning_rate": 1.5529326938013053e-05,
746
- "loss": 0.592,
747
  "step": 590
748
  },
749
  {
750
- "epoch": 0.95,
751
- "learning_rate": 1.5175675745450513e-05,
752
- "loss": 0.5871,
753
  "step": 595
754
  },
755
  {
756
- "epoch": 0.96,
757
- "learning_rate": 1.4823619097949584e-05,
758
- "loss": 0.5914,
759
  "step": 600
760
  },
761
  {
762
- "epoch": 0.97,
763
- "learning_rate": 1.4473273357960035e-05,
764
- "loss": 0.5819,
765
  "step": 605
766
  },
767
  {
768
- "epoch": 0.98,
769
- "learning_rate": 1.4124754322439112e-05,
770
- "loss": 0.5677,
771
  "step": 610
772
  },
773
  {
774
- "epoch": 0.98,
775
- "learning_rate": 1.3778177184578185e-05,
776
- "loss": 0.585,
777
  "step": 615
778
  },
779
  {
780
- "epoch": 0.99,
781
- "learning_rate": 1.3433656495728781e-05,
782
- "loss": 0.571,
783
  "step": 620
784
  },
785
  {
786
- "epoch": 1.0,
787
- "learning_rate": 1.3091306127540916e-05,
788
- "loss": 0.5714,
789
  "step": 625
790
  },
791
  {
792
- "epoch": 1.01,
793
- "learning_rate": 1.275123923432597e-05,
794
- "loss": 0.5794,
795
  "step": 630
796
  },
797
  {
798
- "epoch": 1.02,
799
- "learning_rate": 1.2413568215656735e-05,
800
- "loss": 0.5685,
801
  "step": 635
802
  },
803
  {
804
- "epoch": 1.02,
805
- "learning_rate": 1.2078404679216864e-05,
806
- "loss": 0.58,
807
  "step": 640
808
  },
809
  {
810
- "epoch": 1.03,
811
- "learning_rate": 1.1745859403912108e-05,
812
- "loss": 0.5585,
813
  "step": 645
814
  },
815
  {
816
- "epoch": 1.04,
817
- "learning_rate": 1.1416042303255424e-05,
818
- "loss": 0.5733,
819
  "step": 650
820
  },
821
  {
822
- "epoch": 1.05,
823
- "learning_rate": 1.1089062389038175e-05,
824
- "loss": 0.5736,
825
  "step": 655
826
  },
827
  {
828
- "epoch": 1.06,
829
- "learning_rate": 1.0765027735299327e-05,
830
- "loss": 0.5593,
831
  "step": 660
832
  },
833
  {
834
- "epoch": 1.06,
835
- "learning_rate": 1.04440454426046e-05,
836
- "loss": 0.5623,
837
  "step": 665
838
  },
839
  {
840
- "epoch": 1.07,
841
- "learning_rate": 1.0126221602647395e-05,
842
- "loss": 0.5709,
843
  "step": 670
844
  },
845
  {
846
- "epoch": 1.08,
847
- "learning_rate": 9.811661263183165e-06,
848
- "loss": 0.5722,
849
  "step": 675
850
  },
851
  {
852
- "epoch": 1.09,
853
- "learning_rate": 9.5004683933088e-06,
854
- "loss": 0.5787,
855
  "step": 680
856
  },
857
  {
858
- "epoch": 1.1,
859
- "learning_rate": 9.192745849098575e-06,
860
- "loss": 0.5841,
861
  "step": 685
862
  },
863
  {
864
- "epoch": 1.1,
865
- "learning_rate": 8.888595339607961e-06,
866
- "loss": 0.5594,
867
  "step": 690
868
  },
869
  {
870
- "epoch": 1.11,
871
- "learning_rate": 8.588117393256543e-06,
872
- "loss": 0.5544,
873
  "step": 695
874
  },
875
  {
876
- "epoch": 1.12,
877
- "learning_rate": 8.291411324601191e-06,
878
- "loss": 0.5747,
879
  "step": 700
880
  },
881
  {
882
- "epoch": 1.13,
883
- "learning_rate": 7.998575201510383e-06,
884
- "loss": 0.5602,
885
  "step": 705
886
  },
887
  {
888
- "epoch": 1.14,
889
- "learning_rate": 7.709705812750651e-06,
890
- "loss": 0.5597,
891
  "step": 710
892
  },
893
  {
894
- "epoch": 1.14,
895
- "learning_rate": 7.4248986359957474e-06,
896
- "loss": 0.5536,
897
  "step": 715
898
  },
899
  {
900
- "epoch": 1.15,
901
- "learning_rate": 7.1442478062692135e-06,
902
- "loss": 0.5674,
903
  "step": 720
904
  },
905
  {
906
- "epoch": 1.16,
907
- "learning_rate": 6.867846084830645e-06,
908
- "loss": 0.5669,
 
 
 
 
 
 
 
 
909
  "step": 725
910
  },
911
  {
912
- "epoch": 1.17,
913
- "learning_rate": 6.595784828516085e-06,
914
- "loss": 0.5465,
915
  "step": 730
916
  },
917
  {
918
- "epoch": 1.18,
919
- "learning_rate": 6.328153959542573e-06,
920
- "loss": 0.566,
921
  "step": 735
922
  },
923
  {
924
- "epoch": 1.18,
925
- "learning_rate": 6.065041935786906e-06,
926
- "loss": 0.5553,
927
  "step": 740
928
  },
929
  {
930
- "epoch": 1.19,
931
- "learning_rate": 5.806535721548305e-06,
932
- "loss": 0.5761,
933
  "step": 745
934
  },
935
  {
936
- "epoch": 1.2,
937
- "learning_rate": 5.55272075880489e-06,
938
- "loss": 0.5616,
939
  "step": 750
940
  },
941
  {
942
- "epoch": 1.21,
943
- "learning_rate": 5.303680938973164e-06,
944
- "loss": 0.573,
945
  "step": 755
946
  },
947
  {
948
- "epoch": 1.22,
949
- "learning_rate": 5.059498575180084e-06,
950
- "loss": 0.5407,
951
  "step": 760
952
  },
953
  {
954
- "epoch": 1.22,
955
- "learning_rate": 4.8202543750567635e-06,
956
- "loss": 0.5575,
957
  "step": 765
958
  },
959
  {
960
- "epoch": 1.23,
961
- "eval_loss": 0.7963955998420715,
962
- "eval_runtime": 117.8913,
963
- "eval_samples_per_second": 16.965,
964
- "eval_steps_per_second": 4.241,
965
- "step": 768
966
- },
967
- {
968
- "epoch": 1.23,
969
- "learning_rate": 4.586027414062839e-06,
970
- "loss": 0.5603,
971
  "step": 770
972
  },
973
  {
974
- "epoch": 1.24,
975
- "learning_rate": 4.356895109350272e-06,
976
- "loss": 0.5504,
977
  "step": 775
978
  },
979
  {
980
- "epoch": 1.25,
981
- "learning_rate": 4.132933194175299e-06,
982
- "loss": 0.5396,
983
  "step": 780
984
  },
985
  {
986
- "epoch": 1.26,
987
- "learning_rate": 3.914215692866918e-06,
988
- "loss": 0.5648,
989
  "step": 785
990
  },
991
  {
992
- "epoch": 1.26,
993
- "learning_rate": 3.7008148963602474e-06,
994
- "loss": 0.5547,
995
  "step": 790
996
  },
997
  {
998
- "epoch": 1.27,
999
- "learning_rate": 3.4928013383027247e-06,
1000
- "loss": 0.5439,
1001
  "step": 795
1002
  },
1003
  {
1004
- "epoch": 1.28,
1005
- "learning_rate": 3.290243771741275e-06,
1006
- "loss": 0.5485,
1007
  "step": 800
1008
  },
1009
  {
1010
- "epoch": 1.29,
1011
- "learning_rate": 3.0932091463978397e-06,
1012
- "loss": 0.54,
1013
  "step": 805
1014
  },
1015
  {
1016
- "epoch": 1.3,
1017
- "learning_rate": 2.9017625865410727e-06,
1018
- "loss": 0.5524,
1019
  "step": 810
1020
  },
1021
  {
1022
- "epoch": 1.3,
1023
- "learning_rate": 2.715967369461314e-06,
1024
- "loss": 0.5412,
1025
  "step": 815
1026
  },
1027
  {
1028
- "epoch": 1.31,
1029
- "learning_rate": 2.535884904556085e-06,
1030
- "loss": 0.5538,
1031
  "step": 820
1032
  },
1033
  {
1034
- "epoch": 1.32,
1035
- "learning_rate": 2.3615747130329013e-06,
1036
- "loss": 0.5322,
1037
  "step": 825
1038
  },
1039
  {
1040
- "epoch": 1.33,
1041
- "learning_rate": 2.1930944082362204e-06,
1042
- "loss": 0.5485,
1043
  "step": 830
1044
  },
1045
  {
1046
- "epoch": 1.34,
1047
- "learning_rate": 2.0304996766049844e-06,
1048
- "loss": 0.5451,
1049
  "step": 835
1050
  },
1051
  {
1052
- "epoch": 1.34,
1053
- "learning_rate": 1.8738442592670014e-06,
1054
- "loss": 0.5315,
1055
  "step": 840
1056
  },
1057
  {
1058
- "epoch": 1.35,
1059
- "learning_rate": 1.7231799342763379e-06,
1060
- "loss": 0.5457,
1061
  "step": 845
1062
  },
1063
  {
1064
- "epoch": 1.36,
1065
- "learning_rate": 1.5785564994995284e-06,
1066
- "loss": 0.5476,
1067
  "step": 850
1068
  },
1069
  {
1070
- "epoch": 1.37,
1071
- "learning_rate": 1.4400217561563112e-06,
1072
- "loss": 0.5637,
1073
  "step": 855
1074
  },
1075
  {
1076
- "epoch": 1.38,
1077
- "learning_rate": 1.3076214930202324e-06,
1078
- "loss": 0.5294,
1079
  "step": 860
1080
  },
1081
  {
1082
- "epoch": 1.38,
1083
- "learning_rate": 1.1813994712844922e-06,
1084
- "loss": 0.5446,
1085
  "step": 865
1086
  },
1087
  {
1088
- "epoch": 1.39,
1089
- "learning_rate": 1.0613974100978885e-06,
1090
- "loss": 0.5607,
1091
  "step": 870
1092
  },
1093
  {
1094
- "epoch": 1.4,
1095
- "learning_rate": 9.476549727757267e-07,
1096
- "loss": 0.5386,
1097
  "step": 875
1098
  },
1099
  {
1100
- "epoch": 1.41,
1101
- "learning_rate": 8.402097536902221e-07,
1102
- "loss": 0.5545,
1103
  "step": 880
1104
  },
1105
  {
1106
- "epoch": 1.42,
1107
- "learning_rate": 7.390972658447459e-07,
1108
- "loss": 0.5361,
1109
  "step": 885
1110
  },
1111
  {
1112
- "epoch": 1.42,
1113
- "learning_rate": 6.443509291359817e-07,
1114
- "loss": 0.5461,
1115
  "step": 890
1116
  },
1117
  {
1118
- "epoch": 1.43,
1119
- "learning_rate": 5.56002059307923e-07,
1120
- "loss": 0.546,
1121
  "step": 895
1122
  },
1123
  {
1124
- "epoch": 1.44,
1125
- "learning_rate": 4.740798576013328e-07,
1126
- "loss": 0.5472,
1127
- "step": 900
1128
- },
1129
- {
1130
- "epoch": 1.45,
1131
- "learning_rate": 3.9861140110209806e-07,
1132
- "loss": 0.5339,
1133
- "step": 905
1134
- },
1135
- {
1136
- "epoch": 1.46,
1137
- "learning_rate": 3.296216337916458e-07,
1138
- "loss": 0.536,
1139
- "step": 910
1140
- },
1141
- {
1142
- "epoch": 1.46,
1143
- "learning_rate": 2.671333583024205e-07,
1144
- "loss": 0.5434,
1145
- "step": 915
1146
- },
1147
- {
1148
- "epoch": 1.47,
1149
- "learning_rate": 2.111672283811106e-07,
1150
- "loss": 0.5483,
1151
- "step": 920
1152
- },
1153
- {
1154
- "epoch": 1.48,
1155
- "learning_rate": 1.6174174206212922e-07,
1156
- "loss": 0.5437,
1157
- "step": 925
1158
- },
1159
- {
1160
- "epoch": 1.49,
1161
- "learning_rate": 1.1887323555360708e-07,
1162
- "loss": 0.5446,
1163
- "step": 930
1164
- },
1165
- {
1166
- "epoch": 1.5,
1167
- "learning_rate": 8.25758778379293e-08,
1168
- "loss": 0.5457,
1169
- "step": 935
1170
- },
1171
- {
1172
- "epoch": 1.5,
1173
- "learning_rate": 5.286166598855502e-08,
1174
- "loss": 0.5429,
1175
- "step": 940
1176
- },
1177
- {
1178
- "epoch": 1.51,
1179
- "learning_rate": 2.974042120473808e-08,
1180
- "loss": 0.5405,
1181
- "step": 945
1182
- },
1183
- {
1184
- "epoch": 1.52,
1185
- "learning_rate": 1.3219785565399268e-08,
1186
- "loss": 0.5388,
1187
- "step": 950
1188
- },
1189
- {
1190
- "epoch": 1.53,
1191
- "learning_rate": 3.305219503249024e-09,
1192
- "loss": 0.5402,
1193
- "step": 955
1194
- },
1195
- {
1196
- "epoch": 1.54,
1197
  "learning_rate": 0.0,
1198
- "loss": 0.5371,
1199
- "step": 960
1200
  },
1201
  {
1202
- "epoch": 1.54,
1203
- "eval_loss": 0.8011998534202576,
1204
- "eval_runtime": 117.9168,
1205
- "eval_samples_per_second": 16.961,
1206
- "eval_steps_per_second": 4.24,
1207
- "step": 960
1208
  },
1209
  {
1210
- "epoch": 1.54,
1211
- "step": 960,
1212
- "total_flos": 1.2724395167155487e+18,
1213
- "train_loss": 0.6617152964075407,
1214
- "train_runtime": 23627.0915,
1215
- "train_samples_per_second": 0.65,
1216
  "train_steps_per_second": 0.041
1217
  }
1218
  ],
1219
  "logging_steps": 5,
1220
- "max_steps": 960,
1221
- "num_train_epochs": 2,
1222
- "save_steps": 192,
1223
- "total_flos": 1.2724395167155487e+18,
1224
  "trial_name": null,
1225
  "trial_params": null
1226
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.03548406174226743,
5
+ "eval_steps": 180,
6
+ "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 4.444444444444445e-07,
14
+ "loss": 1.4612,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.0,
19
+ "learning_rate": 2.222222222222222e-06,
20
+ "loss": 1.4717,
21
  "step": 5
22
  },
23
  {
24
+ "epoch": 0.0,
25
+ "learning_rate": 4.444444444444444e-06,
26
+ "loss": 1.4887,
27
  "step": 10
28
  },
29
  {
30
+ "epoch": 0.0,
31
+ "learning_rate": 6.666666666666667e-06,
32
+ "loss": 1.4126,
33
  "step": 15
34
  },
35
  {
36
+ "epoch": 0.0,
37
+ "learning_rate": 8.888888888888888e-06,
38
+ "loss": 1.2639,
39
  "step": 20
40
  },
41
  {
42
+ "epoch": 0.0,
43
+ "learning_rate": 1.1111111111111113e-05,
44
+ "loss": 1.1005,
45
  "step": 25
46
  },
47
  {
48
+ "epoch": 0.0,
49
+ "learning_rate": 1.3333333333333333e-05,
50
+ "loss": 1.0341,
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.0,
55
+ "learning_rate": 1.555555555555556e-05,
56
+ "loss": 1.0016,
57
  "step": 35
58
  },
59
  {
60
+ "epoch": 0.0,
61
+ "learning_rate": 1.7777777777777777e-05,
62
+ "loss": 0.9879,
63
  "step": 40
64
  },
65
  {
66
+ "epoch": 0.0,
67
+ "learning_rate": 2e-05,
68
+ "loss": 0.947,
69
  "step": 45
70
  },
71
  {
72
+ "epoch": 0.0,
73
+ "learning_rate": 2.2222222222222227e-05,
74
+ "loss": 0.9558,
75
  "step": 50
76
  },
77
  {
78
+ "epoch": 0.0,
79
+ "learning_rate": 2.444444444444445e-05,
80
+ "loss": 0.9208,
81
  "step": 55
82
  },
83
  {
84
+ "epoch": 0.0,
85
+ "learning_rate": 2.6666666666666667e-05,
86
+ "loss": 0.9096,
87
  "step": 60
88
  },
89
  {
90
+ "epoch": 0.0,
91
+ "learning_rate": 2.888888888888889e-05,
92
+ "loss": 0.8924,
93
  "step": 65
94
  },
95
  {
96
+ "epoch": 0.0,
97
+ "learning_rate": 3.111111111111112e-05,
98
+ "loss": 0.8883,
99
  "step": 70
100
  },
101
  {
102
+ "epoch": 0.0,
103
+ "learning_rate": 3.3333333333333335e-05,
104
+ "loss": 0.879,
105
  "step": 75
106
  },
107
  {
108
+ "epoch": 0.0,
109
+ "learning_rate": 3.555555555555555e-05,
110
+ "loss": 0.8643,
111
  "step": 80
112
  },
113
  {
114
+ "epoch": 0.0,
115
+ "learning_rate": 3.777777777777778e-05,
116
+ "loss": 0.8491,
117
  "step": 85
118
  },
119
  {
120
+ "epoch": 0.0,
121
+ "learning_rate": 4e-05,
122
+ "loss": 0.8485,
123
  "step": 90
124
  },
125
  {
126
+ "epoch": 0.0,
127
+ "learning_rate": 3.999623940897003e-05,
128
+ "loss": 0.8734,
129
  "step": 95
130
  },
131
  {
132
+ "epoch": 0.0,
133
+ "learning_rate": 3.998495905008461e-05,
134
+ "loss": 0.8522,
135
  "step": 100
136
  },
137
  {
138
+ "epoch": 0.0,
139
+ "learning_rate": 3.996616316542537e-05,
140
+ "loss": 0.8434,
141
  "step": 105
142
  },
143
  {
144
+ "epoch": 0.0,
145
+ "learning_rate": 3.993985882335584e-05,
146
+ "loss": 0.8312,
147
  "step": 110
148
  },
149
  {
150
+ "epoch": 0.0,
151
+ "learning_rate": 3.9906055915863316e-05,
152
+ "loss": 0.8248,
153
  "step": 115
154
  },
155
  {
156
+ "epoch": 0.0,
157
+ "learning_rate": 3.9864767154838864e-05,
158
+ "loss": 0.8387,
159
  "step": 120
160
  },
161
  {
162
+ "epoch": 0.0,
163
+ "learning_rate": 3.9816008067296905e-05,
164
+ "loss": 0.8191,
165
  "step": 125
166
  },
167
  {
168
+ "epoch": 0.01,
169
+ "learning_rate": 3.9759796989536185e-05,
170
+ "loss": 0.8047,
171
  "step": 130
172
  },
173
  {
174
+ "epoch": 0.01,
175
+ "learning_rate": 3.9696155060244166e-05,
176
+ "loss": 0.7999,
177
  "step": 135
178
  },
179
  {
180
+ "epoch": 0.01,
181
+ "learning_rate": 3.9625106212547696e-05,
182
+ "loss": 0.7841,
183
  "step": 140
184
  },
185
  {
186
+ "epoch": 0.01,
187
+ "learning_rate": 3.9546677165012714e-05,
188
+ "loss": 0.7546,
189
  "step": 145
190
  },
191
  {
192
+ "epoch": 0.01,
193
+ "learning_rate": 3.946089741159648e-05,
194
+ "loss": 0.7539,
195
  "step": 150
196
  },
197
  {
198
+ "epoch": 0.01,
199
+ "learning_rate": 3.9367799210556124e-05,
200
+ "loss": 0.7346,
201
  "step": 155
202
  },
203
  {
204
+ "epoch": 0.01,
205
+ "learning_rate": 3.926741757231761e-05,
206
+ "loss": 0.7315,
207
  "step": 160
208
  },
209
  {
210
+ "epoch": 0.01,
211
+ "learning_rate": 3.915979024630978e-05,
212
+ "loss": 0.744,
213
  "step": 165
214
  },
215
  {
216
+ "epoch": 0.01,
217
+ "learning_rate": 3.904495770676831e-05,
218
+ "loss": 0.7298,
219
  "step": 170
220
  },
221
  {
222
+ "epoch": 0.01,
223
+ "learning_rate": 3.892296313751502e-05,
224
+ "loss": 0.7385,
225
  "step": 175
226
  },
227
  {
228
+ "epoch": 0.01,
229
+ "learning_rate": 3.879385241571817e-05,
230
+ "loss": 0.7541,
231
  "step": 180
232
  },
233
  {
234
+ "epoch": 0.01,
235
+ "eval_loss": 0.7222402691841125,
236
+ "eval_runtime": 118.1542,
237
+ "eval_samples_per_second": 16.927,
238
+ "eval_steps_per_second": 4.232,
239
+ "step": 180
240
  },
241
  {
242
+ "epoch": 0.01,
243
+ "learning_rate": 3.865767409464002e-05,
244
+ "loss": 0.7311,
245
+ "step": 185
246
  },
247
  {
248
+ "epoch": 0.01,
249
+ "learning_rate": 3.8514479385377813e-05,
250
+ "loss": 0.7376,
251
+ "step": 190
 
 
252
  },
253
  {
254
+ "epoch": 0.01,
255
+ "learning_rate": 3.8364322137605484e-05,
256
+ "loss": 0.727,
257
  "step": 195
258
  },
259
  {
260
+ "epoch": 0.01,
261
+ "learning_rate": 3.8207258819322936e-05,
262
+ "loss": 0.739,
263
  "step": 200
264
  },
265
  {
266
+ "epoch": 0.01,
267
+ "learning_rate": 3.804334849562076e-05,
268
+ "loss": 0.741,
269
  "step": 205
270
  },
271
  {
272
+ "epoch": 0.01,
273
+ "learning_rate": 3.787265280646825e-05,
274
+ "loss": 0.7216,
275
  "step": 210
276
  },
277
  {
278
+ "epoch": 0.01,
279
+ "learning_rate": 3.7695235943533155e-05,
280
+ "loss": 0.7115,
281
  "step": 215
282
  },
283
  {
284
+ "epoch": 0.01,
285
+ "learning_rate": 3.7511164626041823e-05,
286
+ "loss": 0.731,
287
  "step": 220
288
  },
289
  {
290
+ "epoch": 0.01,
291
+ "learning_rate": 3.732050807568878e-05,
292
+ "loss": 0.7185,
293
  "step": 225
294
  },
295
  {
296
+ "epoch": 0.01,
297
+ "learning_rate": 3.7123337990605335e-05,
298
+ "loss": 0.729,
299
  "step": 230
300
  },
301
  {
302
+ "epoch": 0.01,
303
+ "learning_rate": 3.691972851839682e-05,
304
+ "loss": 0.7485,
305
  "step": 235
306
  },
307
  {
308
+ "epoch": 0.01,
309
+ "learning_rate": 3.6709756228258735e-05,
310
+ "loss": 0.7344,
311
  "step": 240
312
  },
313
  {
314
+ "epoch": 0.01,
315
+ "learning_rate": 3.649350008218214e-05,
316
+ "loss": 0.7527,
317
  "step": 245
318
  },
319
  {
320
+ "epoch": 0.01,
321
+ "learning_rate": 3.6271041405259354e-05,
322
+ "loss": 0.7381,
323
  "step": 250
324
  },
325
  {
326
+ "epoch": 0.01,
327
+ "learning_rate": 3.604246385510088e-05,
328
+ "loss": 0.7512,
329
  "step": 255
330
  },
331
  {
332
+ "epoch": 0.01,
333
+ "learning_rate": 3.580785339037519e-05,
334
+ "loss": 0.7368,
335
  "step": 260
336
  },
337
  {
338
+ "epoch": 0.01,
339
+ "learning_rate": 3.5567298238483206e-05,
340
+ "loss": 0.7228,
341
  "step": 265
342
  },
343
  {
344
+ "epoch": 0.01,
345
+ "learning_rate": 3.532088886237956e-05,
346
+ "loss": 0.7361,
347
  "step": 270
348
  },
349
  {
350
+ "epoch": 0.01,
351
+ "learning_rate": 3.506871792655321e-05,
352
+ "loss": 0.727,
353
  "step": 275
354
  },
355
  {
356
+ "epoch": 0.01,
357
+ "learning_rate": 3.48108802621801e-05,
358
+ "loss": 0.7198,
359
  "step": 280
360
  },
361
  {
362
+ "epoch": 0.01,
363
+ "learning_rate": 3.4547472831460976e-05,
364
+ "loss": 0.736,
365
  "step": 285
366
  },
367
  {
368
+ "epoch": 0.01,
369
+ "learning_rate": 3.4278594691157985e-05,
370
+ "loss": 0.7552,
371
  "step": 290
372
  },
373
  {
374
+ "epoch": 0.01,
375
+ "learning_rate": 3.400434695534337e-05,
376
+ "loss": 0.7316,
377
  "step": 295
378
  },
379
  {
380
+ "epoch": 0.01,
381
+ "learning_rate": 3.372483275737468e-05,
382
+ "loss": 0.7235,
383
  "step": 300
384
  },
385
  {
386
+ "epoch": 0.01,
387
+ "learning_rate": 3.3440157211110454e-05,
388
+ "loss": 0.7189,
389
  "step": 305
390
  },
391
  {
392
+ "epoch": 0.01,
393
+ "learning_rate": 3.315042737138128e-05,
394
+ "loss": 0.7358,
395
  "step": 310
396
  },
397
  {
398
+ "epoch": 0.01,
399
+ "learning_rate": 3.285575219373079e-05,
400
+ "loss": 0.7252,
401
  "step": 315
402
  },
403
  {
404
+ "epoch": 0.01,
405
+ "learning_rate": 3.255624249344198e-05,
406
+ "loss": 0.7157,
407
  "step": 320
408
  },
409
  {
410
+ "epoch": 0.01,
411
+ "learning_rate": 3.2252010903864057e-05,
412
+ "loss": 0.7175,
413
  "step": 325
414
  },
415
  {
416
+ "epoch": 0.01,
417
+ "learning_rate": 3.194317183405573e-05,
418
+ "loss": 0.7417,
419
  "step": 330
420
  },
421
  {
422
+ "epoch": 0.01,
423
+ "learning_rate": 3.1629841425760534e-05,
424
+ "loss": 0.736,
425
  "step": 335
426
  },
427
  {
428
+ "epoch": 0.01,
429
+ "learning_rate": 3.1312137509730776e-05,
430
+ "loss": 0.7194,
431
  "step": 340
432
  },
433
  {
434
+ "epoch": 0.01,
435
+ "learning_rate": 3.0990179561416124e-05,
436
+ "loss": 0.7222,
437
  "step": 345
438
  },
439
  {
440
+ "epoch": 0.01,
441
+ "learning_rate": 3.066408865603383e-05,
442
+ "loss": 0.7196,
443
  "step": 350
444
  },
445
  {
446
+ "epoch": 0.01,
447
+ "learning_rate": 3.0333987423037262e-05,
448
+ "loss": 0.714,
449
  "step": 355
450
  },
451
  {
452
+ "epoch": 0.01,
453
+ "learning_rate": 3.0000000000000004e-05,
454
+ "loss": 0.7126,
455
+ "step": 360
456
+ },
457
+ {
458
+ "epoch": 0.01,
459
+ "eval_loss": 0.7118021249771118,
460
+ "eval_runtime": 117.7706,
461
+ "eval_samples_per_second": 16.982,
462
+ "eval_steps_per_second": 4.246,
463
  "step": 360
464
  },
465
  {
466
+ "epoch": 0.01,
467
+ "learning_rate": 2.9662251985932773e-05,
468
+ "loss": 0.7159,
469
  "step": 365
470
  },
471
  {
472
+ "epoch": 0.01,
473
+ "learning_rate": 2.9320870394050783e-05,
474
+ "loss": 0.7249,
475
  "step": 370
476
  },
477
  {
478
+ "epoch": 0.01,
479
+ "learning_rate": 2.897598360400925e-05,
480
+ "loss": 0.7182,
481
  "step": 375
482
  },
483
  {
484
+ "epoch": 0.01,
485
+ "learning_rate": 2.8627721313625073e-05,
486
+ "loss": 0.7083,
487
  "step": 380
488
  },
489
  {
490
+ "epoch": 0.02,
491
+ "learning_rate": 2.8276214490102788e-05,
492
+ "loss": 0.7238,
 
 
 
 
 
 
 
 
493
  "step": 385
494
  },
495
  {
496
+ "epoch": 0.02,
497
+ "learning_rate": 2.792159532078314e-05,
498
+ "loss": 0.7137,
499
  "step": 390
500
  },
501
  {
502
+ "epoch": 0.02,
503
+ "learning_rate": 2.7563997163432853e-05,
504
+ "loss": 0.7182,
505
  "step": 395
506
  },
507
  {
508
+ "epoch": 0.02,
509
+ "learning_rate": 2.720355449609421e-05,
510
+ "loss": 0.7285,
511
  "step": 400
512
  },
513
  {
514
+ "epoch": 0.02,
515
+ "learning_rate": 2.684040286651338e-05,
516
+ "loss": 0.6988,
517
  "step": 405
518
  },
519
  {
520
+ "epoch": 0.02,
521
+ "learning_rate": 2.6474678841166426e-05,
522
+ "loss": 0.7217,
523
  "step": 410
524
  },
525
  {
526
+ "epoch": 0.02,
527
+ "learning_rate": 2.6106519953902268e-05,
528
+ "loss": 0.7185,
529
  "step": 415
530
  },
531
  {
532
+ "epoch": 0.02,
533
+ "learning_rate": 2.5736064654221808e-05,
534
+ "loss": 0.7303,
535
  "step": 420
536
  },
537
  {
538
+ "epoch": 0.02,
539
+ "learning_rate": 2.536345225521275e-05,
540
+ "loss": 0.7134,
541
  "step": 425
542
  },
543
  {
544
+ "epoch": 0.02,
545
+ "learning_rate": 2.4988822881159627e-05,
546
+ "loss": 0.7215,
547
  "step": 430
548
  },
549
  {
550
+ "epoch": 0.02,
551
+ "learning_rate": 2.4612317414848804e-05,
552
+ "loss": 0.7219,
553
  "step": 435
554
  },
555
  {
556
+ "epoch": 0.02,
557
+ "learning_rate": 2.423407744458822e-05,
558
+ "loss": 0.7253,
559
  "step": 440
560
  },
561
  {
562
+ "epoch": 0.02,
563
+ "learning_rate": 2.3854245210961798e-05,
564
+ "loss": 0.7251,
565
  "step": 445
566
  },
567
  {
568
+ "epoch": 0.02,
569
+ "learning_rate": 2.3472963553338614e-05,
570
+ "loss": 0.7259,
571
  "step": 450
572
  },
573
  {
574
+ "epoch": 0.02,
575
+ "learning_rate": 2.3090375856156813e-05,
576
+ "loss": 0.7412,
577
  "step": 455
578
  },
579
  {
580
+ "epoch": 0.02,
581
+ "learning_rate": 2.2706625995002626e-05,
582
+ "loss": 0.7316,
583
  "step": 460
584
  },
585
  {
586
+ "epoch": 0.02,
587
+ "learning_rate": 2.2321858282504606e-05,
588
+ "loss": 0.7166,
589
  "step": 465
590
  },
591
  {
592
+ "epoch": 0.02,
593
+ "learning_rate": 2.1936217414063584e-05,
594
+ "loss": 0.7453,
595
  "step": 470
596
  },
597
  {
598
+ "epoch": 0.02,
599
+ "learning_rate": 2.154984841343862e-05,
600
+ "loss": 0.7135,
601
  "step": 475
602
  },
603
  {
604
+ "epoch": 0.02,
605
+ "learning_rate": 2.1162896578209517e-05,
606
+ "loss": 0.712,
607
  "step": 480
608
  },
609
  {
610
+ "epoch": 0.02,
611
+ "learning_rate": 2.077550742513634e-05,
612
+ "loss": 0.7185,
613
  "step": 485
614
  },
615
  {
616
+ "epoch": 0.02,
617
+ "learning_rate": 2.038782663543649e-05,
618
+ "loss": 0.7366,
619
  "step": 490
620
  },
621
  {
622
+ "epoch": 0.02,
623
+ "learning_rate": 2e-05,
624
+ "loss": 0.7302,
625
  "step": 495
626
  },
627
  {
628
+ "epoch": 0.02,
629
+ "learning_rate": 1.9612173364563517e-05,
630
+ "loss": 0.7393,
631
  "step": 500
632
  },
633
  {
634
+ "epoch": 0.02,
635
+ "learning_rate": 1.9224492574863663e-05,
636
+ "loss": 0.7296,
637
  "step": 505
638
  },
639
  {
640
+ "epoch": 0.02,
641
+ "learning_rate": 1.8837103421790486e-05,
642
+ "loss": 0.7109,
643
  "step": 510
644
  },
645
  {
646
+ "epoch": 0.02,
647
+ "learning_rate": 1.8450151586561386e-05,
648
+ "loss": 0.7092,
649
  "step": 515
650
  },
651
  {
652
+ "epoch": 0.02,
653
+ "learning_rate": 1.806378258593642e-05,
654
+ "loss": 0.7381,
655
  "step": 520
656
  },
657
  {
658
+ "epoch": 0.02,
659
+ "learning_rate": 1.7678141717495394e-05,
660
+ "loss": 0.733,
661
  "step": 525
662
  },
663
  {
664
+ "epoch": 0.02,
665
+ "learning_rate": 1.7293374004997384e-05,
666
+ "loss": 0.7068,
667
  "step": 530
668
  },
669
  {
670
+ "epoch": 0.02,
671
+ "learning_rate": 1.6909624143843193e-05,
672
+ "loss": 0.7062,
673
  "step": 535
674
  },
675
  {
676
+ "epoch": 0.02,
677
+ "learning_rate": 1.6527036446661396e-05,
678
+ "loss": 0.7342,
679
+ "step": 540
680
+ },
681
+ {
682
+ "epoch": 0.02,
683
+ "eval_loss": 0.7100361585617065,
684
+ "eval_runtime": 117.785,
685
+ "eval_samples_per_second": 16.98,
686
+ "eval_steps_per_second": 4.245,
687
  "step": 540
688
  },
689
  {
690
+ "epoch": 0.02,
691
+ "learning_rate": 1.6145754789038205e-05,
692
+ "loss": 0.7193,
693
  "step": 545
694
  },
695
  {
696
+ "epoch": 0.02,
697
+ "learning_rate": 1.5765922555411793e-05,
698
+ "loss": 0.7087,
699
  "step": 550
700
  },
701
  {
702
+ "epoch": 0.02,
703
+ "learning_rate": 1.53876825851512e-05,
704
+ "loss": 0.7228,
705
  "step": 555
706
  },
707
  {
708
+ "epoch": 0.02,
709
+ "learning_rate": 1.5011177118840376e-05,
710
+ "loss": 0.7254,
711
  "step": 560
712
  },
713
  {
714
+ "epoch": 0.02,
715
+ "learning_rate": 1.4636547744787251e-05,
716
+ "loss": 0.732,
717
  "step": 565
718
  },
719
  {
720
+ "epoch": 0.02,
721
+ "learning_rate": 1.4263935345778202e-05,
722
+ "loss": 0.7186,
723
  "step": 570
724
  },
725
  {
726
+ "epoch": 0.02,
727
+ "learning_rate": 1.389348004609774e-05,
728
+ "loss": 0.7091,
729
  "step": 575
730
  },
731
  {
732
+ "epoch": 0.02,
733
+ "learning_rate": 1.3525321158833582e-05,
734
+ "loss": 0.7278,
 
 
 
 
 
 
 
 
735
  "step": 580
736
  },
737
  {
738
+ "epoch": 0.02,
739
+ "learning_rate": 1.3159597133486628e-05,
740
+ "loss": 0.7088,
741
  "step": 585
742
  },
743
  {
744
+ "epoch": 0.02,
745
+ "learning_rate": 1.2796445503905797e-05,
746
+ "loss": 0.7188,
747
  "step": 590
748
  },
749
  {
750
+ "epoch": 0.02,
751
+ "learning_rate": 1.2436002836567154e-05,
752
+ "loss": 0.7311,
753
  "step": 595
754
  },
755
  {
756
+ "epoch": 0.02,
757
+ "learning_rate": 1.2078404679216864e-05,
758
+ "loss": 0.727,
759
  "step": 600
760
  },
761
  {
762
+ "epoch": 0.02,
763
+ "learning_rate": 1.1723785509897219e-05,
764
+ "loss": 0.7089,
765
  "step": 605
766
  },
767
  {
768
+ "epoch": 0.02,
769
+ "learning_rate": 1.1372278686374935e-05,
770
+ "loss": 0.7108,
771
  "step": 610
772
  },
773
  {
774
+ "epoch": 0.02,
775
+ "learning_rate": 1.1024016395990758e-05,
776
+ "loss": 0.7189,
777
  "step": 615
778
  },
779
  {
780
+ "epoch": 0.02,
781
+ "learning_rate": 1.067912960594923e-05,
782
+ "loss": 0.717,
783
  "step": 620
784
  },
785
  {
786
+ "epoch": 0.02,
787
+ "learning_rate": 1.033774801406723e-05,
788
+ "loss": 0.713,
789
  "step": 625
790
  },
791
  {
792
+ "epoch": 0.02,
793
+ "learning_rate": 1.0000000000000006e-05,
794
+ "loss": 0.7217,
795
  "step": 630
796
  },
797
  {
798
+ "epoch": 0.03,
799
+ "learning_rate": 9.666012576962743e-06,
800
+ "loss": 0.7099,
801
  "step": 635
802
  },
803
  {
804
+ "epoch": 0.03,
805
+ "learning_rate": 9.33591134396618e-06,
806
+ "loss": 0.6947,
807
  "step": 640
808
  },
809
  {
810
+ "epoch": 0.03,
811
+ "learning_rate": 9.009820438583881e-06,
812
+ "loss": 0.7051,
813
  "step": 645
814
  },
815
  {
816
+ "epoch": 0.03,
817
+ "learning_rate": 8.687862490269232e-06,
818
+ "loss": 0.7244,
819
  "step": 650
820
  },
821
  {
822
+ "epoch": 0.03,
823
+ "learning_rate": 8.370158574239466e-06,
824
+ "loss": 0.7233,
825
  "step": 655
826
  },
827
  {
828
+ "epoch": 0.03,
829
+ "learning_rate": 8.056828165944282e-06,
830
+ "loss": 0.712,
831
  "step": 660
832
  },
833
  {
834
+ "epoch": 0.03,
835
+ "learning_rate": 7.747989096135943e-06,
836
+ "loss": 0.7236,
837
  "step": 665
838
  },
839
  {
840
+ "epoch": 0.03,
841
+ "learning_rate": 7.443757506558033e-06,
842
+ "loss": 0.7134,
843
  "step": 670
844
  },
845
  {
846
+ "epoch": 0.03,
847
+ "learning_rate": 7.1442478062692135e-06,
848
+ "loss": 0.714,
849
  "step": 675
850
  },
851
  {
852
+ "epoch": 0.03,
853
+ "learning_rate": 6.84957262861873e-06,
854
+ "loss": 0.7058,
855
  "step": 680
856
  },
857
  {
858
+ "epoch": 0.03,
859
+ "learning_rate": 6.559842788889552e-06,
860
+ "loss": 0.7265,
861
  "step": 685
862
  },
863
  {
864
+ "epoch": 0.03,
865
+ "learning_rate": 6.275167242625331e-06,
866
+ "loss": 0.7119,
867
  "step": 690
868
  },
869
  {
870
+ "epoch": 0.03,
871
+ "learning_rate": 5.9956530446566305e-06,
872
+ "loss": 0.7263,
873
  "step": 695
874
  },
875
  {
876
+ "epoch": 0.03,
877
+ "learning_rate": 5.721405308842023e-06,
878
+ "loss": 0.7274,
879
  "step": 700
880
  },
881
  {
882
+ "epoch": 0.03,
883
+ "learning_rate": 5.452527168539026e-06,
884
+ "loss": 0.7166,
885
  "step": 705
886
  },
887
  {
888
+ "epoch": 0.03,
889
+ "learning_rate": 5.189119737819912e-06,
890
+ "loss": 0.713,
891
  "step": 710
892
  },
893
  {
894
+ "epoch": 0.03,
895
+ "learning_rate": 4.9312820734467855e-06,
896
+ "loss": 0.7253,
897
  "step": 715
898
  },
899
  {
900
+ "epoch": 0.03,
901
+ "learning_rate": 4.679111137620442e-06,
902
+ "loss": 0.7216,
903
  "step": 720
904
  },
905
  {
906
+ "epoch": 0.03,
907
+ "eval_loss": 0.708259105682373,
908
+ "eval_runtime": 117.7068,
909
+ "eval_samples_per_second": 16.991,
910
+ "eval_steps_per_second": 4.248,
911
+ "step": 720
912
+ },
913
+ {
914
+ "epoch": 0.03,
915
+ "learning_rate": 4.4327017615168e-06,
916
+ "loss": 0.7165,
917
  "step": 725
918
  },
919
  {
920
+ "epoch": 0.03,
921
+ "learning_rate": 4.1921466096248164e-06,
922
+ "loss": 0.7303,
923
  "step": 730
924
  },
925
  {
926
+ "epoch": 0.03,
927
+ "learning_rate": 3.957536144899123e-06,
928
+ "loss": 0.7198,
929
  "step": 735
930
  },
931
  {
932
+ "epoch": 0.03,
933
+ "learning_rate": 3.7289585947406504e-06,
934
+ "loss": 0.7186,
935
  "step": 740
936
  },
937
  {
938
+ "epoch": 0.03,
939
+ "learning_rate": 3.5064999178178648e-06,
940
+ "loss": 0.7438,
941
  "step": 745
942
  },
943
  {
944
+ "epoch": 0.03,
945
+ "learning_rate": 3.290243771741275e-06,
946
+ "loss": 0.7191,
947
  "step": 750
948
  },
949
  {
950
+ "epoch": 0.03,
951
+ "learning_rate": 3.0802714816031787e-06,
952
+ "loss": 0.7132,
953
  "step": 755
954
  },
955
  {
956
+ "epoch": 0.03,
957
+ "learning_rate": 2.876662009394673e-06,
958
+ "loss": 0.7278,
959
  "step": 760
960
  },
961
  {
962
+ "epoch": 0.03,
963
+ "learning_rate": 2.679491924311226e-06,
964
+ "loss": 0.7085,
965
  "step": 765
966
  },
967
  {
968
+ "epoch": 0.03,
969
+ "learning_rate": 2.488835373958185e-06,
970
+ "loss": 0.7115,
 
 
 
 
 
 
 
 
971
  "step": 770
972
  },
973
  {
974
+ "epoch": 0.03,
975
+ "learning_rate": 2.304764056466844e-06,
976
+ "loss": 0.7118,
977
  "step": 775
978
  },
979
  {
980
+ "epoch": 0.03,
981
+ "learning_rate": 2.127347193531757e-06,
982
+ "loss": 0.7121,
983
  "step": 780
984
  },
985
  {
986
+ "epoch": 0.03,
987
+ "learning_rate": 1.9566515043792455e-06,
988
+ "loss": 0.7174,
989
  "step": 785
990
  },
991
  {
992
+ "epoch": 0.03,
993
+ "learning_rate": 1.792741180677069e-06,
994
+ "loss": 0.6913,
995
  "step": 790
996
  },
997
  {
998
+ "epoch": 0.03,
999
+ "learning_rate": 1.6356778623945223e-06,
1000
+ "loss": 0.7038,
1001
  "step": 795
1002
  },
1003
  {
1004
+ "epoch": 0.03,
1005
+ "learning_rate": 1.4855206146221934e-06,
1006
+ "loss": 0.7156,
1007
  "step": 800
1008
  },
1009
  {
1010
+ "epoch": 0.03,
1011
+ "learning_rate": 1.3423259053599891e-06,
1012
+ "loss": 0.7113,
1013
  "step": 805
1014
  },
1015
  {
1016
+ "epoch": 0.03,
1017
+ "learning_rate": 1.2061475842818337e-06,
1018
+ "loss": 0.727,
1019
  "step": 810
1020
  },
1021
  {
1022
+ "epoch": 0.03,
1023
+ "learning_rate": 1.0770368624849947e-06,
1024
+ "loss": 0.7186,
1025
  "step": 815
1026
  },
1027
  {
1028
+ "epoch": 0.03,
1029
+ "learning_rate": 9.550422932316938e-07,
1030
+ "loss": 0.7088,
1031
  "step": 820
1032
  },
1033
  {
1034
+ "epoch": 0.03,
1035
+ "learning_rate": 8.402097536902221e-07,
1036
+ "loss": 0.7125,
1037
  "step": 825
1038
  },
1039
  {
1040
+ "epoch": 0.03,
1041
+ "learning_rate": 7.325824276823934e-07,
1042
+ "loss": 0.7076,
1043
  "step": 830
1044
  },
1045
  {
1046
+ "epoch": 0.03,
1047
+ "learning_rate": 6.322007894438842e-07,
1048
+ "loss": 0.7099,
1049
  "step": 835
1050
  },
1051
  {
1052
+ "epoch": 0.03,
1053
+ "learning_rate": 5.391025884035239e-07,
1054
+ "loss": 0.7238,
1055
  "step": 840
1056
  },
1057
  {
1058
+ "epoch": 0.03,
1059
+ "learning_rate": 4.533228349872887e-07,
1060
+ "loss": 0.7241,
1061
  "step": 845
1062
  },
1063
  {
1064
+ "epoch": 0.03,
1065
+ "learning_rate": 3.748937874523062e-07,
1066
+ "loss": 0.7248,
1067
  "step": 850
1068
  },
1069
  {
1070
+ "epoch": 0.03,
1071
+ "learning_rate": 3.038449397558396e-07,
1072
+ "loss": 0.7083,
1073
  "step": 855
1074
  },
1075
  {
1076
+ "epoch": 0.03,
1077
+ "learning_rate": 2.402030104638198e-07,
1078
+ "loss": 0.7167,
1079
  "step": 860
1080
  },
1081
  {
1082
+ "epoch": 0.03,
1083
+ "learning_rate": 1.839919327030937e-07,
1084
+ "loss": 0.7277,
1085
  "step": 865
1086
  },
1087
  {
1088
+ "epoch": 0.03,
1089
+ "learning_rate": 1.3523284516113955e-07,
1090
+ "loss": 0.7075,
1091
  "step": 870
1092
  },
1093
  {
1094
+ "epoch": 0.03,
1095
+ "learning_rate": 9.394408413668343e-08,
1096
+ "loss": 0.7134,
1097
  "step": 875
1098
  },
1099
  {
1100
+ "epoch": 0.03,
1101
+ "learning_rate": 6.014117664415953e-08,
1102
+ "loss": 0.7147,
1103
  "step": 880
1104
  },
1105
  {
1106
+ "epoch": 0.03,
1107
+ "learning_rate": 3.383683457463649e-08,
1108
+ "loss": 0.7287,
1109
  "step": 885
1110
  },
1111
  {
1112
+ "epoch": 0.04,
1113
+ "learning_rate": 1.5040949915399173e-08,
1114
+ "loss": 0.7152,
1115
  "step": 890
1116
  },
1117
  {
1118
+ "epoch": 0.04,
1119
+ "learning_rate": 3.760591029973171e-09,
1120
+ "loss": 0.7195,
1121
  "step": 895
1122
  },
1123
  {
1124
+ "epoch": 0.04,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1125
  "learning_rate": 0.0,
1126
+ "loss": 0.7171,
1127
+ "step": 900
1128
  },
1129
  {
1130
+ "epoch": 0.04,
1131
+ "eval_loss": 0.7084137797355652,
1132
+ "eval_runtime": 117.7342,
1133
+ "eval_samples_per_second": 16.987,
1134
+ "eval_steps_per_second": 4.247,
1135
+ "step": 900
1136
  },
1137
  {
1138
+ "epoch": 0.04,
1139
+ "step": 900,
1140
+ "total_flos": 1.1929120479106826e+18,
1141
+ "train_loss": 0.7602509791321225,
1142
+ "train_runtime": 22175.0023,
1143
+ "train_samples_per_second": 0.649,
1144
  "train_steps_per_second": 0.041
1145
  }
1146
  ],
1147
  "logging_steps": 5,
1148
+ "max_steps": 900,
1149
+ "num_train_epochs": 1,
1150
+ "save_steps": 180,
1151
+ "total_flos": 1.1929120479106826e+18,
1152
  "trial_name": null,
1153
  "trial_params": null
1154
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e4293e4b07049ac9ae8d94f77930f2db536f10de6e8e326cdf856056cf94f80
3
  size 4728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f65537292be9c1a03c759b840c2f8b9421e85df46af77ce90fdb8f69e1b17890
3
  size 4728