|
{ |
|
"best_metric": 1.1526199579238892, |
|
"best_model_checkpoint": "data/tinyllama_moe_sft_ultrachat-slimorca/checkpoint-2000", |
|
"epoch": 0.9997585124366095, |
|
"eval_steps": 100, |
|
"global_step": 2070, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 1.6666666666666668e-07, |
|
"loss": 2.9792, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 2.9452, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 2.9114, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 2.5e-06, |
|
"loss": 2.8843, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 2.6761, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 2.3539, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 5e-06, |
|
"loss": 2.1166, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 5.833333333333334e-06, |
|
"loss": 2.0101, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.8725, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 1.7979, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.7447, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 9.166666666666666e-06, |
|
"loss": 1.6985, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1e-05, |
|
"loss": 1.6134, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1.0833333333333334e-05, |
|
"loss": 1.6174, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 1.5786, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.25e-05, |
|
"loss": 1.5374, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 1.523, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.416666666666667e-05, |
|
"loss": 1.5384, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 1.4798, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.5833333333333333e-05, |
|
"loss": 1.4584, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.4601, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 1.3360612392425537, |
|
"eval_runtime": 428.8225, |
|
"eval_samples_per_second": 37.699, |
|
"eval_steps_per_second": 1.18, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.7500000000000002e-05, |
|
"loss": 1.4256, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.8333333333333333e-05, |
|
"loss": 1.4164, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.916666666666667e-05, |
|
"loss": 1.427, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 2e-05, |
|
"loss": 1.4041, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.9999675557165282e-05, |
|
"loss": 1.404, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.9998702249713747e-05, |
|
"loss": 1.4163, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.9997080140801932e-05, |
|
"loss": 1.3984, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.9994809335686152e-05, |
|
"loss": 1.3717, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.9991889981715696e-05, |
|
"loss": 1.3905, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.998832226832327e-05, |
|
"loss": 1.3486, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.9984106427012667e-05, |
|
"loss": 1.3748, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.9979242731343803e-05, |
|
"loss": 1.3651, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.9973731496914914e-05, |
|
"loss": 1.3452, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.9967573081342103e-05, |
|
"loss": 1.3533, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.9960767884236132e-05, |
|
"loss": 1.3422, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.995331634717649e-05, |
|
"loss": 1.3161, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.9945218953682736e-05, |
|
"loss": 1.341, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.9936476229183133e-05, |
|
"loss": 1.3373, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.992708874098054e-05, |
|
"loss": 1.3365, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 1.9917057098215624e-05, |
|
"loss": 1.3324, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 1.2566393613815308, |
|
"eval_runtime": 585.8351, |
|
"eval_samples_per_second": 27.595, |
|
"eval_steps_per_second": 0.864, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 1.9906381951827295e-05, |
|
"loss": 1.3309, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 1.9895063994510512e-05, |
|
"loss": 1.3236, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 1.9883103960671305e-05, |
|
"loss": 1.3254, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.9870502626379127e-05, |
|
"loss": 1.3107, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.985726080931651e-05, |
|
"loss": 1.3345, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.9843379368725978e-05, |
|
"loss": 1.3303, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.9828859205354326e-05, |
|
"loss": 1.3179, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.9813701261394136e-05, |
|
"loss": 1.2992, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.979790652042268e-05, |
|
"loss": 1.3117, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.9781476007338058e-05, |
|
"loss": 1.3035, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.9764410788292724e-05, |
|
"loss": 1.2918, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.9746711970624282e-05, |
|
"loss": 1.3105, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.9728380702783644e-05, |
|
"loss": 1.3266, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.9709418174260523e-05, |
|
"loss": 1.3068, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.968982561550621e-05, |
|
"loss": 1.3045, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.9669604297853766e-05, |
|
"loss": 1.3042, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.9648755533435517e-05, |
|
"loss": 1.3033, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.962728067509791e-05, |
|
"loss": 1.2891, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.9605181116313725e-05, |
|
"loss": 1.2984, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.9582458291091664e-05, |
|
"loss": 1.2946, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 1.2279455661773682, |
|
"eval_runtime": 426.1287, |
|
"eval_samples_per_second": 37.937, |
|
"eval_steps_per_second": 1.187, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.955911367388329e-05, |
|
"loss": 1.2973, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.9535148779487365e-05, |
|
"loss": 1.2933, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.9510565162951538e-05, |
|
"loss": 1.2931, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.9485364419471454e-05, |
|
"loss": 1.2918, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.9459548184287254e-05, |
|
"loss": 1.2965, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.9433118132577432e-05, |
|
"loss": 1.2924, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.9406075979350175e-05, |
|
"loss": 1.3012, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.9378423479332045e-05, |
|
"loss": 1.2948, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1.9350162426854152e-05, |
|
"loss": 1.2708, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1.932129465573568e-05, |
|
"loss": 1.2749, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1.9291822039164934e-05, |
|
"loss": 1.2849, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1.9261746489577767e-05, |
|
"loss": 1.2926, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.923106995853349e-05, |
|
"loss": 1.2743, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.9199794436588244e-05, |
|
"loss": 1.2767, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.9167921953165827e-05, |
|
"loss": 1.2673, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.913545457642601e-05, |
|
"loss": 1.2751, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.9102394413130348e-05, |
|
"loss": 1.2782, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.9068743608505454e-05, |
|
"loss": 1.2673, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.9034504346103825e-05, |
|
"loss": 1.2675, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.8999678847662124e-05, |
|
"loss": 1.2767, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_loss": 1.2110730409622192, |
|
"eval_runtime": 424.7414, |
|
"eval_samples_per_second": 38.061, |
|
"eval_steps_per_second": 1.191, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.896426937295704e-05, |
|
"loss": 1.2654, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.892827821965864e-05, |
|
"loss": 1.2771, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.8891707723181294e-05, |
|
"loss": 1.2757, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.8854560256532098e-05, |
|
"loss": 1.2621, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.881683823015694e-05, |
|
"loss": 1.2785, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.8778544091784047e-05, |
|
"loss": 1.252, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.873968032626518e-05, |
|
"loss": 1.2634, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.8700249455414394e-05, |
|
"loss": 1.2811, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.866025403784439e-05, |
|
"loss": 1.2621, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.8619696668800494e-05, |
|
"loss": 1.266, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.8578579979992266e-05, |
|
"loss": 1.2579, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.8536906639422724e-05, |
|
"loss": 1.2456, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.8494679351215212e-05, |
|
"loss": 1.236, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1.845190085543795e-05, |
|
"loss": 1.2619, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1.8408573927926225e-05, |
|
"loss": 1.2551, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1.8364701380102267e-05, |
|
"loss": 1.2534, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1.8320286058792845e-05, |
|
"loss": 1.2637, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.82753308460445e-05, |
|
"loss": 1.2571, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.8229838658936566e-05, |
|
"loss": 1.2657, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.818381244939187e-05, |
|
"loss": 1.2298, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 1.1995348930358887, |
|
"eval_runtime": 424.3499, |
|
"eval_samples_per_second": 38.096, |
|
"eval_steps_per_second": 1.192, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.81372552039852e-05, |
|
"loss": 1.2547, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.8090169943749477e-05, |
|
"loss": 1.2452, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.804255972397977e-05, |
|
"loss": 1.246, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.7994427634035016e-05, |
|
"loss": 1.2402, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.7945776797137544e-05, |
|
"loss": 1.2531, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 1.7896610370170452e-05, |
|
"loss": 1.243, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 1.7846931543472722e-05, |
|
"loss": 1.2344, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 1.7796743540632226e-05, |
|
"loss": 1.2555, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 1.7746049618276545e-05, |
|
"loss": 1.2283, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.769485306586166e-05, |
|
"loss": 1.235, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.7643157205458483e-05, |
|
"loss": 1.2255, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.7590965391537316e-05, |
|
"loss": 1.2409, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.753828101075017e-05, |
|
"loss": 1.2303, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.7485107481711014e-05, |
|
"loss": 1.2293, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.7431448254773943e-05, |
|
"loss": 1.2354, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.7377306811809306e-05, |
|
"loss": 1.2277, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.7322686665977738e-05, |
|
"loss": 1.2437, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.7267591361502233e-05, |
|
"loss": 1.2332, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.7212024473438145e-05, |
|
"loss": 1.2539, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.715598960744121e-05, |
|
"loss": 1.2247, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_loss": 1.190222978591919, |
|
"eval_runtime": 425.341, |
|
"eval_samples_per_second": 38.007, |
|
"eval_steps_per_second": 1.19, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.7099490399533583e-05, |
|
"loss": 1.2454, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.7042530515867897e-05, |
|
"loss": 1.2263, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 1.6985113652489374e-05, |
|
"loss": 1.2203, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 1.6927243535095995e-05, |
|
"loss": 1.2256, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 1.6868923918796753e-05, |
|
"loss": 1.236, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 1.6810158587867973e-05, |
|
"loss": 1.244, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1.6750951355507763e-05, |
|
"loss": 1.2408, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1.6691306063588583e-05, |
|
"loss": 1.216, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1.6631226582407954e-05, |
|
"loss": 1.2315, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1.657071681043731e-05, |
|
"loss": 1.2348, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.650978067406904e-05, |
|
"loss": 1.2338, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.6448422127361707e-05, |
|
"loss": 1.2434, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.638664515178348e-05, |
|
"loss": 1.2237, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.6324453755953772e-05, |
|
"loss": 1.2026, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1.626185197538314e-05, |
|
"loss": 1.2498, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1.6198843872211404e-05, |
|
"loss": 1.2291, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1.613543353494409e-05, |
|
"loss": 1.2269, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1.6071625078187113e-05, |
|
"loss": 1.2363, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.600742264237979e-05, |
|
"loss": 1.207, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.5942830393526176e-05, |
|
"loss": 1.2208, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_loss": 1.18331778049469, |
|
"eval_runtime": 427.6678, |
|
"eval_samples_per_second": 37.8, |
|
"eval_steps_per_second": 1.183, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.5877852522924733e-05, |
|
"loss": 1.2137, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.5812493246896368e-05, |
|
"loss": 1.2171, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.574675680651084e-05, |
|
"loss": 1.2311, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.568064746731156e-05, |
|
"loss": 1.2106, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.561416951903881e-05, |
|
"loss": 1.2061, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.554732727535139e-05, |
|
"loss": 1.2039, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.5480125073546705e-05, |
|
"loss": 1.1872, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.5412567274279316e-05, |
|
"loss": 1.2143, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.5344658261278013e-05, |
|
"loss": 1.21, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.527640244106133e-05, |
|
"loss": 1.198, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.5207804242651625e-05, |
|
"loss": 1.2096, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.5138868117287689e-05, |
|
"loss": 1.2292, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.5069598538135905e-05, |
|
"loss": 1.2208, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 1.2113, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.4930077019029376e-05, |
|
"loss": 1.2139, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 1.485983413242606e-05, |
|
"loss": 1.2268, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 1.4789275898150309e-05, |
|
"loss": 1.2414, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 1.471840689462482e-05, |
|
"loss": 1.2141, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 1.4647231720437687e-05, |
|
"loss": 1.1926, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.4575754994043956e-05, |
|
"loss": 1.2375, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_loss": 1.1774698495864868, |
|
"eval_runtime": 422.1872, |
|
"eval_samples_per_second": 38.291, |
|
"eval_steps_per_second": 1.199, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.450398135346597e-05, |
|
"loss": 1.2201, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.4431915455992416e-05, |
|
"loss": 1.2204, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.4359561977876102e-05, |
|
"loss": 1.2133, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1.4286925614030542e-05, |
|
"loss": 1.2086, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1.4214011077725293e-05, |
|
"loss": 1.2039, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1.414082310028012e-05, |
|
"loss": 1.1965, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1.4067366430758004e-05, |
|
"loss": 1.2217, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3993645835656955e-05, |
|
"loss": 1.231, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3919666098600753e-05, |
|
"loss": 1.206, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3845432020028511e-05, |
|
"loss": 1.2025, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3770948416883205e-05, |
|
"loss": 1.2339, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 1.369622012229911e-05, |
|
"loss": 1.2021, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 1.362125198528817e-05, |
|
"loss": 1.2036, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 1.3546048870425356e-05, |
|
"loss": 1.2242, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 1.347061565753303e-05, |
|
"loss": 1.2259, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.3394957241364273e-05, |
|
"loss": 1.1964, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.3319078531285286e-05, |
|
"loss": 1.2042, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.3242984450956829e-05, |
|
"loss": 1.2158, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.3166679938014728e-05, |
|
"loss": 1.2052, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.3090169943749475e-05, |
|
"loss": 1.2038, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"eval_loss": 1.1725637912750244, |
|
"eval_runtime": 421.5684, |
|
"eval_samples_per_second": 38.347, |
|
"eval_steps_per_second": 1.2, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1.301345943278496e-05, |
|
"loss": 1.2096, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1.293655338275631e-05, |
|
"loss": 1.2106, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1.2859456783986892e-05, |
|
"loss": 1.189, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1.2782174639164528e-05, |
|
"loss": 1.1913, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.270471196301684e-05, |
|
"loss": 1.214, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.262707378198587e-05, |
|
"loss": 1.2046, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.2549265133901934e-05, |
|
"loss": 1.1957, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.2471291067656696e-05, |
|
"loss": 1.1824, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.2393156642875579e-05, |
|
"loss": 1.2013, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.2314866929589434e-05, |
|
"loss": 1.209, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.2236427007905558e-05, |
|
"loss": 1.1864, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.2157841967678064e-05, |
|
"loss": 1.2149, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.2079116908177592e-05, |
|
"loss": 1.1946, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.2000256937760446e-05, |
|
"loss": 1.2138, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.1921267173537085e-05, |
|
"loss": 1.2145, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.1842152741040117e-05, |
|
"loss": 1.1983, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.1762918773891691e-05, |
|
"loss": 1.1901, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.1683570413470384e-05, |
|
"loss": 1.2152, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.1604112808577603e-05, |
|
"loss": 1.2067, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.1524551115103455e-05, |
|
"loss": 1.1926, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 1.168326735496521, |
|
"eval_runtime": 421.3368, |
|
"eval_samples_per_second": 38.368, |
|
"eval_steps_per_second": 1.201, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1.1444890495692214e-05, |
|
"loss": 1.1972, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1.1365136119407318e-05, |
|
"loss": 1.1905, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1.1285293161395948e-05, |
|
"loss": 1.1882, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1.1205366802553231e-05, |
|
"loss": 1.2066, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.1125362229186056e-05, |
|
"loss": 1.1901, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.1045284632676535e-05, |
|
"loss": 1.1869, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.0965139209145153e-05, |
|
"loss": 1.1932, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.0884931159113585e-05, |
|
"loss": 1.182, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.0804665687167262e-05, |
|
"loss": 1.214, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.0724348001617626e-05, |
|
"loss": 1.193, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.0643983314164195e-05, |
|
"loss": 1.207, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.0563576839556375e-05, |
|
"loss": 1.2182, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.0483133795255072e-05, |
|
"loss": 1.1954, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 1.0402659401094154e-05, |
|
"loss": 1.1991, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 1.0322158878941733e-05, |
|
"loss": 1.1939, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 1.0241637452361323e-05, |
|
"loss": 1.184, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 1.0161100346272913e-05, |
|
"loss": 1.2052, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 1.0080552786613899e-05, |
|
"loss": 1.2077, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 1e-05, |
|
"loss": 1.1848, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 9.919447213386103e-06, |
|
"loss": 1.1933, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 1.1648716926574707, |
|
"eval_runtime": 424.4832, |
|
"eval_samples_per_second": 38.084, |
|
"eval_steps_per_second": 1.192, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 9.838899653727088e-06, |
|
"loss": 1.1803, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.75836254763868e-06, |
|
"loss": 1.1981, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.677841121058274e-06, |
|
"loss": 1.199, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.597340598905851e-06, |
|
"loss": 1.215, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.516866204744932e-06, |
|
"loss": 1.1851, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 9.436423160443625e-06, |
|
"loss": 1.1746, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 9.356016685835807e-06, |
|
"loss": 1.1955, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 9.275651998382377e-06, |
|
"loss": 1.1971, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 9.195334312832742e-06, |
|
"loss": 1.1919, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 9.115068840886418e-06, |
|
"loss": 1.1744, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 9.034860790854848e-06, |
|
"loss": 1.1884, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 8.954715367323468e-06, |
|
"loss": 1.1796, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 8.874637770813947e-06, |
|
"loss": 1.191, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.79463319744677e-06, |
|
"loss": 1.196, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.714706838604056e-06, |
|
"loss": 1.2032, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.634863880592687e-06, |
|
"loss": 1.1897, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.55510950430779e-06, |
|
"loss": 1.1964, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.475448884896546e-06, |
|
"loss": 1.1858, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 8.395887191422397e-06, |
|
"loss": 1.1918, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 8.316429586529616e-06, |
|
"loss": 1.1893, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_loss": 1.1618335247039795, |
|
"eval_runtime": 423.2525, |
|
"eval_samples_per_second": 38.195, |
|
"eval_steps_per_second": 1.196, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 8.23708122610831e-06, |
|
"loss": 1.1907, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 8.157847258959885e-06, |
|
"loss": 1.2021, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 8.078732826462917e-06, |
|
"loss": 1.1999, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 7.999743062239557e-06, |
|
"loss": 1.1716, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 7.92088309182241e-06, |
|
"loss": 1.1864, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 7.84215803232194e-06, |
|
"loss": 1.1855, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.763572992094448e-06, |
|
"loss": 1.1899, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.685133070410571e-06, |
|
"loss": 1.1814, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.606843357124426e-06, |
|
"loss": 1.2001, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.5287089323433035e-06, |
|
"loss": 1.1886, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 7.450734866098066e-06, |
|
"loss": 1.2065, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 7.372926218014131e-06, |
|
"loss": 1.1812, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 7.2952880369831635e-06, |
|
"loss": 1.1729, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 7.217825360835475e-06, |
|
"loss": 1.1843, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 7.140543216013109e-06, |
|
"loss": 1.1864, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 7.063446617243695e-06, |
|
"loss": 1.1875, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 6.986540567215043e-06, |
|
"loss": 1.209, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 6.909830056250527e-06, |
|
"loss": 1.2043, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 6.833320061985278e-06, |
|
"loss": 1.1849, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 6.757015549043174e-06, |
|
"loss": 1.2029, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_loss": 1.1593303680419922, |
|
"eval_runtime": 425.0269, |
|
"eval_samples_per_second": 38.035, |
|
"eval_steps_per_second": 1.191, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 6.680921468714718e-06, |
|
"loss": 1.182, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 6.605042758635729e-06, |
|
"loss": 1.1861, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 6.529384342466971e-06, |
|
"loss": 1.1725, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 6.453951129574644e-06, |
|
"loss": 1.1873, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 6.378748014711834e-06, |
|
"loss": 1.1856, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 6.30377987770089e-06, |
|
"loss": 1.1836, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 6.229051583116796e-06, |
|
"loss": 1.1768, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 6.154567979971493e-06, |
|
"loss": 1.1871, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 6.080333901399252e-06, |
|
"loss": 1.1747, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 6.006354164343047e-06, |
|
"loss": 1.2114, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 5.932633569242e-06, |
|
"loss": 1.1872, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 5.859176899719883e-06, |
|
"loss": 1.1945, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 5.785988922274711e-06, |
|
"loss": 1.1785, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 5.713074385969457e-06, |
|
"loss": 1.2026, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 5.640438022123898e-06, |
|
"loss": 1.1928, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.5680845440075885e-06, |
|
"loss": 1.1775, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.496018646534032e-06, |
|
"loss": 1.1988, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.424245005956048e-06, |
|
"loss": 1.199, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.352768279562315e-06, |
|
"loss": 1.2145, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 5.28159310537518e-06, |
|
"loss": 1.2201, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 1.1572028398513794, |
|
"eval_runtime": 422.5597, |
|
"eval_samples_per_second": 38.257, |
|
"eval_steps_per_second": 1.197, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 5.210724101849696e-06, |
|
"loss": 1.2036, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 5.14016586757394e-06, |
|
"loss": 1.1714, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 5.069922980970626e-06, |
|
"loss": 1.164, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 1.1985, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.930401461864099e-06, |
|
"loss": 1.1966, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.861131882712314e-06, |
|
"loss": 1.1939, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.7921957573483756e-06, |
|
"loss": 1.2031, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.7235975589386715e-06, |
|
"loss": 1.1858, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.655341738721989e-06, |
|
"loss": 1.1909, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.587432725720687e-06, |
|
"loss": 1.1826, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.519874926453303e-06, |
|
"loss": 1.1905, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.4526727246486116e-06, |
|
"loss": 1.1671, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.385830480961192e-06, |
|
"loss": 1.196, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.319352532688444e-06, |
|
"loss": 1.1855, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.2532431934891646e-06, |
|
"loss": 1.1721, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.187506753103637e-06, |
|
"loss": 1.1905, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 4.12214747707527e-06, |
|
"loss": 1.1937, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 4.057169606473828e-06, |
|
"loss": 1.1809, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 3.99257735762021e-06, |
|
"loss": 1.1805, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 3.9283749218128885e-06, |
|
"loss": 1.1741, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 1.155676007270813, |
|
"eval_runtime": 424.7643, |
|
"eval_samples_per_second": 38.059, |
|
"eval_steps_per_second": 1.191, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3.864566465055913e-06, |
|
"loss": 1.178, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3.8011561277885965e-06, |
|
"loss": 1.1738, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3.738148024616863e-06, |
|
"loss": 1.2035, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3.6755462440462288e-06, |
|
"loss": 1.1699, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.6133548482165225e-06, |
|
"loss": 1.1839, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.5515778726382967e-06, |
|
"loss": 1.1988, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.4902193259309627e-06, |
|
"loss": 1.1747, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.4292831895626944e-06, |
|
"loss": 1.1824, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 3.3687734175920505e-06, |
|
"loss": 1.1772, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 3.308693936411421e-06, |
|
"loss": 1.1655, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 3.2490486444922396e-06, |
|
"loss": 1.1734, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 3.1898414121320277e-06, |
|
"loss": 1.1759, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 3.131076081203247e-06, |
|
"loss": 1.1901, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 3.0727564649040066e-06, |
|
"loss": 1.174, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 3.0148863475106315e-06, |
|
"loss": 1.1773, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 2.9574694841321082e-06, |
|
"loss": 1.1741, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2.900509600466418e-06, |
|
"loss": 1.179, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2.8440103925587904e-06, |
|
"loss": 1.1896, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2.7879755265618558e-06, |
|
"loss": 1.1705, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2.73240863849777e-06, |
|
"loss": 1.1813, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"eval_loss": 1.1545099020004272, |
|
"eval_runtime": 425.9638, |
|
"eval_samples_per_second": 37.952, |
|
"eval_steps_per_second": 1.188, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.6773133340222677e-06, |
|
"loss": 1.1822, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.622693188190699e-06, |
|
"loss": 1.1801, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.5685517452260566e-06, |
|
"loss": 1.1689, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.514892518288988e-06, |
|
"loss": 1.1649, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.4617189892498326e-06, |
|
"loss": 1.1727, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.4090346084626857e-06, |
|
"loss": 1.1716, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.3568427945415163e-06, |
|
"loss": 1.1942, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.3051469341383403e-06, |
|
"loss": 1.1729, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.2539503817234553e-06, |
|
"loss": 1.1856, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 2.2032564593677773e-06, |
|
"loss": 1.1631, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 2.153068456527283e-06, |
|
"loss": 1.1741, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 2.103389629829551e-06, |
|
"loss": 1.1656, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 2.0542232028624585e-06, |
|
"loss": 1.1743, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 2.0055723659649907e-06, |
|
"loss": 1.1863, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.9574402760202315e-06, |
|
"loss": 1.1892, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.9098300562505266e-06, |
|
"loss": 1.1999, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.8627447960148036e-06, |
|
"loss": 1.173, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 1.8161875506081294e-06, |
|
"loss": 1.1957, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 1.7701613410634367e-06, |
|
"loss": 1.178, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 1.7246691539555027e-06, |
|
"loss": 1.1668, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"eval_loss": 1.153558611869812, |
|
"eval_runtime": 426.9817, |
|
"eval_samples_per_second": 37.861, |
|
"eval_steps_per_second": 1.185, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 1.6797139412071583e-06, |
|
"loss": 1.1847, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.6352986198977327e-06, |
|
"loss": 1.1803, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.5914260720737796e-06, |
|
"loss": 1.1755, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.5480991445620541e-06, |
|
"loss": 1.1839, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.5053206487847916e-06, |
|
"loss": 1.1679, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.4630933605772801e-06, |
|
"loss": 1.1912, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.4214200200077343e-06, |
|
"loss": 1.172, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.3803033311995072e-06, |
|
"loss": 1.193, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.339745962155613e-06, |
|
"loss": 1.1819, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.2997505445856085e-06, |
|
"loss": 1.1911, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.2603196737348211e-06, |
|
"loss": 1.1804, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.2214559082159538e-06, |
|
"loss": 1.1797, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.1831617698430609e-06, |
|
"loss": 1.1686, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.1454397434679022e-06, |
|
"loss": 1.1825, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 1.1082922768187098e-06, |
|
"loss": 1.1881, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 1.0717217803413605e-06, |
|
"loss": 1.1746, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 1.0357306270429623e-06, |
|
"loss": 1.1833, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 1.0003211523378798e-06, |
|
"loss": 1.1932, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 9.65495653896179e-07, |
|
"loss": 1.1858, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 9.312563914945461e-07, |
|
"loss": 1.1495, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_loss": 1.1530314683914185, |
|
"eval_runtime": 426.2825, |
|
"eval_samples_per_second": 37.923, |
|
"eval_steps_per_second": 1.187, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 8.976055868696543e-07, |
|
"loss": 1.1573, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 8.645454235739903e-07, |
|
"loss": 1.1824, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 8.320780468341761e-07, |
|
"loss": 1.1805, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 8.002055634117578e-07, |
|
"loss": 1.1705, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 7.689300414665124e-07, |
|
"loss": 1.1755, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 7.382535104222366e-07, |
|
"loss": 1.1529, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 7.08177960835068e-07, |
|
"loss": 1.1716, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 6.787053442643233e-07, |
|
"loss": 1.1828, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 6.498375731458529e-07, |
|
"loss": 1.1944, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 6.215765206679569e-07, |
|
"loss": 1.1898, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 5.939240206498287e-07, |
|
"loss": 1.169, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 5.668818674225684e-07, |
|
"loss": 1.1748, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 5.404518157127481e-07, |
|
"loss": 1.1664, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 5.146355805285452e-07, |
|
"loss": 1.1663, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 4.894348370484648e-07, |
|
"loss": 1.1845, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 4.6485122051263764e-07, |
|
"loss": 1.168, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 4.408863261167096e-07, |
|
"loss": 1.1911, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 4.1754170890833777e-07, |
|
"loss": 1.1901, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.9481888368627764e-07, |
|
"loss": 1.1845, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.7271932490209327e-07, |
|
"loss": 1.1595, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_loss": 1.1527146100997925, |
|
"eval_runtime": 427.0976, |
|
"eval_samples_per_second": 37.851, |
|
"eval_steps_per_second": 1.185, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.5124446656448654e-07, |
|
"loss": 1.1582, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.303957021462378e-07, |
|
"loss": 1.1688, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.101743844937943e-07, |
|
"loss": 1.1785, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.905818257394799e-07, |
|
"loss": 1.182, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.716192972163556e-07, |
|
"loss": 1.1625, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.532880293757223e-07, |
|
"loss": 1.1795, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.355892117072789e-07, |
|
"loss": 1.1623, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 2.1852399266194312e-07, |
|
"loss": 1.1802, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 2.0209347957732328e-07, |
|
"loss": 1.1782, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 1.8629873860586567e-07, |
|
"loss": 1.153, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 1.711407946456789e-07, |
|
"loss": 1.1797, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.5662063127402262e-07, |
|
"loss": 1.1657, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.4273919068349184e-07, |
|
"loss": 1.1736, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.2949737362087156e-07, |
|
"loss": 1.1771, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.1689603932869664e-07, |
|
"loss": 1.1638, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 1.0493600548948879e-07, |
|
"loss": 1.1764, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 9.36180481727067e-08, |
|
"loss": 1.1829, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 8.29429017843797e-08, |
|
"loss": 1.1888, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 7.291125901946027e-08, |
|
"loss": 1.1743, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 6.352377081687011e-08, |
|
"loss": 1.1607, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_loss": 1.1526199579238892, |
|
"eval_runtime": 426.4931, |
|
"eval_samples_per_second": 37.904, |
|
"eval_steps_per_second": 1.186, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 5.4781046317267103e-08, |
|
"loss": 1.1562, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 4.6683652823513725e-08, |
|
"loss": 1.1882, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 3.923211576387087e-08, |
|
"loss": 1.178, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 3.242691865790071e-08, |
|
"loss": 1.165, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 2.6268503085089547e-08, |
|
"loss": 1.1853, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 2.0757268656198536e-08, |
|
"loss": 1.1643, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 1.5893572987333293e-08, |
|
"loss": 1.1776, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 1.1677731676733584e-08, |
|
"loss": 1.1739, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 8.110018284304132e-09, |
|
"loss": 1.1549, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 5.190664313851068e-09, |
|
"loss": 1.1693, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 2.9198591980705847e-09, |
|
"loss": 1.1632, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 1.2977502862532298e-09, |
|
"loss": 1.1761, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 3.244428347204398e-10, |
|
"loss": 1.1803, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 0.0, |
|
"loss": 1.1949, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 2070, |
|
"total_flos": 2.0719233559829676e+19, |
|
"train_loss": 1.2525324755820675, |
|
"train_runtime": 32325.2724, |
|
"train_samples_per_second": 8.198, |
|
"train_steps_per_second": 0.064 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2070, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"total_flos": 2.0719233559829676e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|