|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 2730,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 2.5403712464892836,
      "learning_rate": 7.326007326007327e-08,
      "loss": 1.379,
      "step": 1
    },
    {
      "epoch": 0.01,
      "grad_norm": 2.5059454164116177,
      "learning_rate": 3.6630036630036635e-07,
      "loss": 1.411,
      "step": 5
    },
    {
      "epoch": 0.01,
      "grad_norm": 2.403039830631443,
      "learning_rate": 7.326007326007327e-07,
      "loss": 1.3904,
      "step": 10
    },
    {
      "epoch": 0.02,
      "grad_norm": 2.187796179583603,
      "learning_rate": 1.098901098901099e-06,
      "loss": 1.3698,
      "step": 15
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.4489435205825498,
      "learning_rate": 1.4652014652014654e-06,
      "loss": 1.2609,
      "step": 20
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.483113856101743,
      "learning_rate": 1.8315018315018316e-06,
      "loss": 1.1341,
      "step": 25
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.7051067484479403,
      "learning_rate": 2.197802197802198e-06,
      "loss": 1.0478,
      "step": 30
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.7254079611231652,
      "learning_rate": 2.564102564102564e-06,
      "loss": 1.0312,
      "step": 35
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.6389491910662315,
      "learning_rate": 2.930402930402931e-06,
      "loss": 0.9897,
      "step": 40
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.500844994643637,
      "learning_rate": 3.2967032967032968e-06,
      "loss": 0.9549,
      "step": 45
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.4201833928940666,
      "learning_rate": 3.663003663003663e-06,
      "loss": 0.9373,
      "step": 50
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.370974019020061,
      "learning_rate": 4.0293040293040296e-06,
      "loss": 0.9118,
      "step": 55
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.34743939582124356,
      "learning_rate": 4.395604395604396e-06,
      "loss": 0.8832,
      "step": 60
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.30550288282826443,
      "learning_rate": 4.761904761904762e-06,
      "loss": 0.8905,
      "step": 65
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.3877890435103539,
      "learning_rate": 5.128205128205128e-06,
      "loss": 0.8745,
      "step": 70
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.4090389705350457,
      "learning_rate": 5.494505494505495e-06,
      "loss": 0.869,
      "step": 75
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.28471599620620913,
      "learning_rate": 5.860805860805862e-06,
      "loss": 0.8701,
      "step": 80
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.2835464196942085,
      "learning_rate": 6.227106227106228e-06,
      "loss": 0.8566,
      "step": 85
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.3339230231336609,
      "learning_rate": 6.5934065934065935e-06,
      "loss": 0.8926,
      "step": 90
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.26409030663411054,
      "learning_rate": 6.95970695970696e-06,
      "loss": 0.8639,
      "step": 95
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.2548657900119433,
      "learning_rate": 7.326007326007326e-06,
      "loss": 0.8574,
      "step": 100
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.33583619074613247,
      "learning_rate": 7.692307692307694e-06,
      "loss": 0.8371,
      "step": 105
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.26284318740686946,
      "learning_rate": 8.058608058608059e-06,
      "loss": 0.8209,
      "step": 110
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.27047354400674645,
      "learning_rate": 8.424908424908426e-06,
      "loss": 0.8125,
      "step": 115
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.25311492465959046,
      "learning_rate": 8.791208791208792e-06,
      "loss": 0.8371,
      "step": 120
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.27273984018778547,
      "learning_rate": 9.157509157509158e-06,
      "loss": 0.8361,
      "step": 125
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.2789308175031694,
      "learning_rate": 9.523809523809525e-06,
      "loss": 0.7995,
      "step": 130
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.26030914277390094,
      "learning_rate": 9.890109890109892e-06,
      "loss": 0.8137,
      "step": 135
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.8625694710794981,
      "learning_rate": 1.0256410256410256e-05,
      "loss": 0.7968,
      "step": 140
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.26526694267370393,
      "learning_rate": 1.0622710622710623e-05,
      "loss": 0.8011,
      "step": 145
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.2786629865850713,
      "learning_rate": 1.098901098901099e-05,
      "loss": 0.7944,
      "step": 150
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.2547160983307644,
      "learning_rate": 1.1355311355311356e-05,
      "loss": 0.8072,
      "step": 155
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.252471949121061,
      "learning_rate": 1.1721611721611723e-05,
      "loss": 0.7941,
      "step": 160
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.25858945532947464,
      "learning_rate": 1.2087912087912089e-05,
      "loss": 0.7951,
      "step": 165
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.247980244139678,
      "learning_rate": 1.2454212454212456e-05,
      "loss": 0.7858,
      "step": 170
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.255903929999623,
      "learning_rate": 1.2820512820512823e-05,
      "loss": 0.7878,
      "step": 175
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.6019780019252425,
      "learning_rate": 1.3186813186813187e-05,
      "loss": 0.7795,
      "step": 180
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.27779059252699495,
      "learning_rate": 1.3553113553113554e-05,
      "loss": 0.7754,
      "step": 185
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.2710280200562755,
      "learning_rate": 1.391941391941392e-05,
      "loss": 0.7855,
      "step": 190
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.2743116770469599,
      "learning_rate": 1.4285714285714287e-05,
      "loss": 0.7565,
      "step": 195
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.26037793315040886,
      "learning_rate": 1.4652014652014653e-05,
      "loss": 0.7726,
      "step": 200
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.26090200965795596,
      "learning_rate": 1.501831501831502e-05,
      "loss": 0.7653,
      "step": 205
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.2632394726032374,
      "learning_rate": 1.5384615384615387e-05,
      "loss": 0.777,
      "step": 210
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.2540381175124688,
      "learning_rate": 1.575091575091575e-05,
      "loss": 0.7433,
      "step": 215
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.2627604876525692,
      "learning_rate": 1.6117216117216118e-05,
      "loss": 0.7597,
      "step": 220
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.2651216093866056,
      "learning_rate": 1.6483516483516486e-05,
      "loss": 0.7609,
      "step": 225
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.24401238946000964,
      "learning_rate": 1.6849816849816853e-05,
      "loss": 0.7469,
      "step": 230
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.2547534354636469,
      "learning_rate": 1.721611721611722e-05,
      "loss": 0.7641,
      "step": 235
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.24893607597108447,
      "learning_rate": 1.7582417582417584e-05,
      "loss": 0.7323,
      "step": 240
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.2530974475033568,
      "learning_rate": 1.794871794871795e-05,
      "loss": 0.7533,
      "step": 245
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.2733773460887857,
      "learning_rate": 1.8315018315018315e-05,
      "loss": 0.7538,
      "step": 250
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.27130243796420767,
      "learning_rate": 1.8681318681318682e-05,
      "loss": 0.7665,
      "step": 255
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.25889603349835133,
      "learning_rate": 1.904761904761905e-05,
      "loss": 0.737,
      "step": 260
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.2758988715664599,
      "learning_rate": 1.9413919413919417e-05,
      "loss": 0.7587,
      "step": 265
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.24591163232882832,
      "learning_rate": 1.9780219780219784e-05,
      "loss": 0.7503,
      "step": 270
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.29213859652031626,
      "learning_rate": 1.9999967302150437e-05,
      "loss": 0.7544,
      "step": 275
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.23841997418474895,
      "learning_rate": 1.9999599453798523e-05,
      "loss": 0.738,
      "step": 280
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.23103339591842176,
      "learning_rate": 1.9998822899867633e-05,
      "loss": 0.7316,
      "step": 285
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.24038013908936962,
      "learning_rate": 1.9997637672097222e-05,
      "loss": 0.7202,
      "step": 290
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.24246381579537735,
      "learning_rate": 1.9996043818930153e-05,
      "loss": 0.7463,
      "step": 295
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.24527189059683477,
      "learning_rate": 1.9994041405510705e-05,
      "loss": 0.755,
      "step": 300
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.24371499098074792,
      "learning_rate": 1.999163051368191e-05,
      "loss": 0.7599,
      "step": 305
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.2575400741876107,
      "learning_rate": 1.9988811241982206e-05,
      "loss": 0.7443,
      "step": 310
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.2404592716601957,
      "learning_rate": 1.9985583705641418e-05,
      "loss": 0.7327,
      "step": 315
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.24637921462827092,
      "learning_rate": 1.9981948036576045e-05,
      "loss": 0.7339,
      "step": 320
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.24681007900489016,
      "learning_rate": 1.997790438338385e-05,
      "loss": 0.7265,
      "step": 325
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.2690257655867855,
      "learning_rate": 1.997345291133783e-05,
      "loss": 0.7232,
      "step": 330
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.2534124946734991,
      "learning_rate": 1.9968593802379405e-05,
      "loss": 0.7327,
      "step": 335
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.23796973039683816,
      "learning_rate": 1.9963327255111033e-05,
      "loss": 0.7218,
      "step": 340
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.23546876853519053,
      "learning_rate": 1.9957653484788054e-05,
      "loss": 0.7291,
      "step": 345
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.2629306598176605,
      "learning_rate": 1.9951572723309918e-05,
      "loss": 0.7177,
      "step": 350
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.2392128886783249,
      "learning_rate": 1.99450852192107e-05,
      "loss": 0.7082,
      "step": 355
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.23159963577625967,
      "learning_rate": 1.9938191237648924e-05,
      "loss": 0.7031,
      "step": 360
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.2522691194103305,
      "learning_rate": 1.9930891060396757e-05,
      "loss": 0.7094,
      "step": 365
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.2708112490882325,
      "learning_rate": 1.992318498582846e-05,
      "loss": 0.7201,
      "step": 370
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.2293891910969228,
      "learning_rate": 1.9915073328908217e-05,
      "loss": 0.7144,
      "step": 375
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.25206778778834,
      "learning_rate": 1.9906556421177256e-05,
      "loss": 0.7234,
      "step": 380
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.2518785514497742,
      "learning_rate": 1.989763461074029e-05,
      "loss": 0.7141,
      "step": 385
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.2378420502334953,
      "learning_rate": 1.9888308262251286e-05,
      "loss": 0.7231,
      "step": 390
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.24009865726485224,
      "learning_rate": 1.987857775689859e-05,
      "loss": 0.7187,
      "step": 395
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.28274675300361035,
      "learning_rate": 1.9868443492389307e-05,
      "loss": 0.7044,
      "step": 400
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.22963475241854425,
      "learning_rate": 1.985790588293308e-05,
      "loss": 0.7104,
      "step": 405
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.2377445508195736,
      "learning_rate": 1.9846965359225127e-05,
      "loss": 0.6946,
      "step": 410
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.2608944217378828,
      "learning_rate": 1.9835622368428673e-05,
      "loss": 0.7113,
      "step": 415
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.24157694006779415,
      "learning_rate": 1.9823877374156647e-05,
      "loss": 0.698,
      "step": 420
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.23398315907950198,
      "learning_rate": 1.9811730856452754e-05,
      "loss": 0.7257,
      "step": 425
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.23894709403347192,
      "learning_rate": 1.9799183311771823e-05,
      "loss": 0.7184,
      "step": 430
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.23431405547670192,
      "learning_rate": 1.9786235252959555e-05,
      "loss": 0.7128,
      "step": 435
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.2376569203749501,
      "learning_rate": 1.977288720923153e-05,
      "loss": 0.7203,
      "step": 440
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.23231143839603255,
      "learning_rate": 1.9759139726151597e-05,
      "loss": 0.7082,
      "step": 445
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.22602078925455668,
      "learning_rate": 1.9744993365609563e-05,
      "loss": 0.6944,
      "step": 450
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.22861362046539935,
      "learning_rate": 1.973044870579824e-05,
      "loss": 0.698,
      "step": 455
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.24188463913379904,
      "learning_rate": 1.9715506341189795e-05,
      "loss": 0.7015,
      "step": 460
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.2327706520398592,
      "learning_rate": 1.970016688251147e-05,
      "loss": 0.7054,
      "step": 465
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.2298533583392459,
      "learning_rate": 1.9684430956720613e-05,
      "loss": 0.7005,
      "step": 470
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.23290007131084822,
      "learning_rate": 1.966829920697905e-05,
      "loss": 0.7098,
      "step": 475
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.23281363580583736,
      "learning_rate": 1.9651772292626804e-05,
      "loss": 0.6818,
      "step": 480
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.23817560008326671,
      "learning_rate": 1.963485088915514e-05,
      "loss": 0.7088,
      "step": 485
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.2350735880544215,
      "learning_rate": 1.961753568817896e-05,
      "loss": 0.7066,
      "step": 490
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.23884064719455786,
      "learning_rate": 1.959982739740854e-05,
      "loss": 0.7042,
      "step": 495
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.21790596438572063,
      "learning_rate": 1.9581726740620585e-05,
      "loss": 0.6757,
      "step": 500
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.23917884378106077,
      "learning_rate": 1.9563234457628678e-05,
      "loss": 0.6921,
      "step": 505
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.2399642009348881,
      "learning_rate": 1.954435130425301e-05,
      "loss": 0.7047,
      "step": 510
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.23880300515157055,
      "learning_rate": 1.952507805228951e-05,
      "loss": 0.6884,
      "step": 515
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.23179275898540247,
      "learning_rate": 1.9505415489478293e-05,
      "loss": 0.6932,
      "step": 520
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.2216104142333809,
      "learning_rate": 1.9485364419471454e-05,
      "loss": 0.6728,
      "step": 525
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.22858244743422157,
      "learning_rate": 1.9464925661800247e-05,
      "loss": 0.6809,
      "step": 530
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.22424934505435848,
      "learning_rate": 1.9444100051841556e-05,
      "loss": 0.6967,
      "step": 535
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.2895129667800956,
      "learning_rate": 1.9422888440783773e-05,
      "loss": 0.6989,
      "step": 540
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.240457298966355,
      "learning_rate": 1.9401291695592e-05,
      "loss": 0.6818,
      "step": 545
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.22825665925272676,
      "learning_rate": 1.9379310698972618e-05,
      "loss": 0.6922,
      "step": 550
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.2172530625853638,
      "learning_rate": 1.935694634933721e-05,
      "loss": 0.6627,
      "step": 555
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.22221082939742373,
      "learning_rate": 1.933419956076584e-05,
      "loss": 0.6744,
      "step": 560
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.2264067795562855,
      "learning_rate": 1.9311071262969675e-05,
      "loss": 0.6641,
      "step": 565
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.2254067456859091,
      "learning_rate": 1.9287562401253023e-05,
      "loss": 0.6892,
      "step": 570
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.2370414298818578,
      "learning_rate": 1.9263673936474662e-05,
      "loss": 0.6779,
      "step": 575
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.22087990811265876,
      "learning_rate": 1.9239406845008583e-05,
      "loss": 0.6805,
      "step": 580
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.22405579155571978,
      "learning_rate": 1.921476211870408e-05,
      "loss": 0.6661,
      "step": 585
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.22938270799667376,
      "learning_rate": 1.918974076484521e-05,
      "loss": 0.6773,
      "step": 590
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.24245138817040104,
      "learning_rate": 1.916434380610963e-05,
      "loss": 0.6922,
      "step": 595
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.22591029702471543,
      "learning_rate": 1.9138572280526795e-05,
      "loss": 0.6672,
      "step": 600
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.22886503680106302,
      "learning_rate": 1.911242724143552e-05,
      "loss": 0.6574,
      "step": 605
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.23592282593270214,
      "learning_rate": 1.908590975744094e-05,
      "loss": 0.6701,
      "step": 610
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.2263018036424681,
      "learning_rate": 1.9059020912370836e-05,
      "loss": 0.6645,
      "step": 615
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.23573388350757823,
      "learning_rate": 1.9031761805231322e-05,
      "loss": 0.6789,
      "step": 620
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.2264306405740615,
      "learning_rate": 1.9004133550161953e-05,
      "loss": 0.6798,
      "step": 625
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.23127140958344955,
      "learning_rate": 1.8976137276390145e-05,
      "loss": 0.6918,
      "step": 630
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.22485640726090023,
      "learning_rate": 1.894777412818506e-05,
      "loss": 0.6579,
      "step": 635
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.2164772947483346,
      "learning_rate": 1.891904526481083e-05,
      "loss": 0.6729,
      "step": 640
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.21930341038212908,
      "learning_rate": 1.8889951860479165e-05,
      "loss": 0.6636,
      "step": 645
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.2212836429863629,
      "learning_rate": 1.8860495104301346e-05,
      "loss": 0.7046,
      "step": 650
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.236222638103693,
      "learning_rate": 1.8830676200239666e-05,
      "loss": 0.6544,
      "step": 655
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.23689174767504043,
      "learning_rate": 1.8800496367058187e-05,
      "loss": 0.6619,
      "step": 660
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.22794708045955256,
      "learning_rate": 1.8769956838272937e-05,
      "loss": 0.6536,
      "step": 665
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.2284737343207547,
      "learning_rate": 1.8739058862101487e-05,
      "loss": 0.6716,
      "step": 670
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.22074952318779162,
      "learning_rate": 1.8707803701411946e-05,
      "loss": 0.671,
      "step": 675
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.2215061880912214,
      "learning_rate": 1.8676192633671342e-05,
      "loss": 0.6865,
      "step": 680
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.23413437096611026,
      "learning_rate": 1.8644226950893394e-05,
      "loss": 0.6707,
      "step": 685
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.22955684806244817,
      "learning_rate": 1.861190795958573e-05,
      "loss": 0.6835,
      "step": 690
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.22476186793320183,
      "learning_rate": 1.857923698069646e-05,
      "loss": 0.6597,
      "step": 695
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.22821748154651522,
      "learning_rate": 1.8546215349560204e-05,
      "loss": 0.6769,
      "step": 700
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.22293250318189695,
      "learning_rate": 1.8512844415843514e-05,
      "loss": 0.6828,
      "step": 705
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.22191425457543576,
      "learning_rate": 1.8479125543489694e-05,
      "loss": 0.6849,
      "step": 710
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.22106154189199517,
      "learning_rate": 1.844506011066308e-05,
      "loss": 0.6877,
      "step": 715
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.22662154758390868,
      "learning_rate": 1.841064950969268e-05,
      "loss": 0.6579,
      "step": 720
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.22395783462295688,
      "learning_rate": 1.8375895147015285e-05,
      "loss": 0.6808,
      "step": 725
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.22270568666818555,
      "learning_rate": 1.8340798443117992e-05,
      "loss": 0.6705,
      "step": 730
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.2150504585631758,
      "learning_rate": 1.8305360832480118e-05,
      "loss": 0.6628,
      "step": 735
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.21416406499964488,
      "learning_rate": 1.8269583763514603e-05,
      "loss": 0.6602,
      "step": 740
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.22446619283908673,
      "learning_rate": 1.8233468698508786e-05,
      "loss": 0.6516,
      "step": 745
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.22209820477583542,
      "learning_rate": 1.819701711356464e-05,
      "loss": 0.6719,
      "step": 750
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.20930564484300637,
      "learning_rate": 1.8160230498538464e-05,
      "loss": 0.6462,
      "step": 755
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.2157300926603618,
      "learning_rate": 1.8123110356979955e-05,
      "loss": 0.6386,
      "step": 760
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.2278341187144425,
      "learning_rate": 1.808565820607078e-05,
      "loss": 0.6864,
      "step": 765
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.22190749729143514,
      "learning_rate": 1.8047875576562556e-05,
      "loss": 0.6606,
      "step": 770
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.22188524148964123,
      "learning_rate": 1.8009764012714283e-05,
      "loss": 0.6369,
      "step": 775
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.24010299878120459,
      "learning_rate": 1.7971325072229227e-05,
      "loss": 0.6586,
      "step": 780
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.2129875521525412,
      "learning_rate": 1.7932560326191265e-05,
      "loss": 0.6446,
      "step": 785
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.2284930031799763,
      "learning_rate": 1.789347135900066e-05,
      "loss": 0.6704,
      "step": 790
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.21164688299648138,
      "learning_rate": 1.7854059768309292e-05,
      "loss": 0.6501,
      "step": 795
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.2185026582076693,
      "learning_rate": 1.7814327164955388e-05,
      "loss": 0.6504,
      "step": 800
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.23885266134914798,
      "learning_rate": 1.777427517289766e-05,
      "loss": 0.6534,
      "step": 805
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.24735644831293754,
      "learning_rate": 1.773390542914894e-05,
      "loss": 0.6593,
      "step": 810
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.21801160079829443,
      "learning_rate": 1.7693219583709266e-05,
      "loss": 0.6538,
      "step": 815
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.23033125474961877,
      "learning_rate": 1.765221929949845e-05,
      "loss": 0.6544,
      "step": 820
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.21636906024225588,
      "learning_rate": 1.7610906252288097e-05,
      "loss": 0.6678,
      "step": 825
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.22862735644017182,
      "learning_rate": 1.7569282130633137e-05,
      "loss": 0.6676,
      "step": 830
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.22135157314088189,
      "learning_rate": 1.752734863580278e-05,
      "loss": 0.6463,
      "step": 835
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.21472268874726408,
      "learning_rate": 1.7485107481711014e-05,
      "loss": 0.6525,
      "step": 840
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.21461981313008696,
      "learning_rate": 1.7442560394846518e-05,
      "loss": 0.6484,
      "step": 845
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.21774234081011679,
      "learning_rate": 1.739970911420213e-05,
      "loss": 0.651,
      "step": 850
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.21598579831580747,
      "learning_rate": 1.7356555391203745e-05,
      "loss": 0.6785,
      "step": 855
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.22611280288574187,
      "learning_rate": 1.7313100989638745e-05,
      "loss": 0.6579,
      "step": 860
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.2118379153642089,
      "learning_rate": 1.7269347685583913e-05,
      "loss": 0.658,
      "step": 865
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.2098318370691198,
      "learning_rate": 1.7225297267332815e-05,
      "loss": 0.6628,
      "step": 870
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.2096338281124951,
      "learning_rate": 1.7180951535322742e-05,
      "loss": 0.6519,
      "step": 875
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.2277800903608631,
      "learning_rate": 1.7136312302061097e-05,
      "loss": 0.6396,
      "step": 880
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.2104470768411409,
      "learning_rate": 1.7091381392051333e-05,
      "loss": 0.6434,
      "step": 885
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.2142332846578606,
      "learning_rate": 1.704616064171836e-05,
      "loss": 0.6384,
      "step": 890
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.21219425658206706,
      "learning_rate": 1.7000651899333512e-05,
      "loss": 0.6346,
      "step": 895
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.2208443696000645,
      "learning_rate": 1.6954857024938976e-05,
      "loss": 0.648,
      "step": 900
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.21055402887891725,
      "learning_rate": 1.6908777890271794e-05,
      "loss": 0.6472,
      "step": 905
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.2122526858833542,
      "learning_rate": 1.686241637868734e-05,
      "loss": 0.6422,
      "step": 910
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.6909855604171753,
      "eval_runtime": 13.3743,
      "eval_samples_per_second": 103.034,
      "eval_steps_per_second": 0.822,
      "step": 910
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.2376946716415055,
      "learning_rate": 1.6815774385082355e-05,
      "loss": 0.6025,
      "step": 915
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.21981152940016552,
      "learning_rate": 1.6768853815817506e-05,
      "loss": 0.6149,
      "step": 920
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.2114064615721331,
      "learning_rate": 1.6721656588639444e-05,
      "loss": 0.6084,
      "step": 925
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.21343882616611637,
      "learning_rate": 1.6674184632602447e-05,
      "loss": 0.6192,
      "step": 930
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.20964848533593644,
      "learning_rate": 1.6626439887989552e-05,
      "loss": 0.577,
      "step": 935
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.21080943869356272,
      "learning_rate": 1.6578424306233282e-05,
      "loss": 0.5858,
      "step": 940
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.21972421289491473,
      "learning_rate": 1.653013984983585e-05,
      "loss": 0.5907,
      "step": 945
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.2148077134860289,
      "learning_rate": 1.6481588492288985e-05,
      "loss": 0.5974,
      "step": 950
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.222495841900069,
      "learning_rate": 1.643277221799323e-05,
      "loss": 0.5979,
      "step": 955
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.21883932168474696,
      "learning_rate": 1.638369302217687e-05,
      "loss": 0.6077,
      "step": 960
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.20756222937087987,
      "learning_rate": 1.633435291081437e-05,
      "loss": 0.5886,
      "step": 965
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.22034735404299097,
      "learning_rate": 1.6284753900544384e-05,
      "loss": 0.6023,
      "step": 970
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.2178991358059654,
      "learning_rate": 1.6234898018587336e-05,
      "loss": 0.6077,
      "step": 975
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.21185656324247812,
      "learning_rate": 1.618478730266255e-05,
      "loss": 0.5891,
      "step": 980
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.2225195916998371,
      "learning_rate": 1.6134423800904985e-05,
      "loss": 0.6082,
      "step": 985
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.21175099229044173,
      "learning_rate": 1.6083809571781498e-05,
      "loss": 0.6022,
      "step": 990
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.21693130048041695,
      "learning_rate": 1.6032946684006745e-05,
      "loss": 0.5877,
      "step": 995
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.20628677851436183,
      "learning_rate": 1.598183721645858e-05,
      "loss": 0.6025,
      "step": 1000
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.21136866931809867,
      "learning_rate": 1.5930483258093144e-05,
      "loss": 0.6056,
      "step": 1005
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.21628647967275302,
      "learning_rate": 1.5878886907859423e-05,
      "loss": 0.5973,
      "step": 1010
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.21460811563007248,
      "learning_rate": 1.5827050274613512e-05,
      "loss": 0.6151,
      "step": 1015
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.21019968808729894,
      "learning_rate": 1.57749754770324e-05,
      "loss": 0.5871,
      "step": 1020
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.21602614964752764,
      "learning_rate": 1.5722664643527362e-05,
      "loss": 0.6088,
      "step": 1025
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.21863820245310453,
      "learning_rate": 1.567011991215699e-05,
      "loss": 0.5968,
      "step": 1030
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.21638329517362545,
      "learning_rate": 1.561734343053979e-05,
      "loss": 0.5879,
      "step": 1035
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.21082524836877004,
      "learning_rate": 1.5564337355766412e-05,
      "loss": 0.583,
      "step": 1040
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.2055779159254746,
      "learning_rate": 1.551110385431148e-05,
      "loss": 0.5934,
      "step": 1045
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.22359885758488707,
      "learning_rate": 1.5457645101945046e-05,
      "loss": 0.5824,
      "step": 1050
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.2194479570684916,
      "learning_rate": 1.540396328364367e-05,
      "loss": 0.6125,
      "step": 1055
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.21187983654193793,
      "learning_rate": 1.5350060593501086e-05,
      "loss": 0.6028,
      "step": 1060
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.21206606898137628,
      "learning_rate": 1.5295939234638566e-05,
      "loss": 0.5934,
      "step": 1065
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.20258591515345375,
      "learning_rate": 1.5241601419114842e-05,
      "loss": 0.5775,
      "step": 1070
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.2136809123465599,
      "learning_rate": 1.5187049367835709e-05,
      "loss": 0.5941,
      "step": 1075
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.21728230317859182,
      "learning_rate": 1.5132285310463243e-05,
      "loss": 0.5832,
      "step": 1080
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.2173442514170428,
      "learning_rate": 1.507731148532468e-05,
      "loss": 0.5896,
      "step": 1085
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.2171859843513773,
      "learning_rate": 1.5022130139320916e-05,
      "loss": 0.6007,
      "step": 1090
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.22340196606689453,
      "learning_rate": 1.4966743527834691e-05,
      "loss": 0.6034,
      "step": 1095
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.20721993882081483,
      "learning_rate": 1.4911153914638388e-05,
      "loss": 0.6027,
      "step": 1100
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.21607496950020452,
      "learning_rate": 1.4855363571801523e-05,
      "loss": 0.6128,
      "step": 1105
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.21215405353798844,
      "learning_rate": 1.4799374779597866e-05,
      "loss": 0.583,
      "step": 1110
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.21724719376448562,
      "learning_rate": 1.474318982641225e-05,
      "loss": 0.5913,
      "step": 1115
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.22458471606601424,
      "learning_rate": 1.4686811008647037e-05,
      "loss": 0.612,
      "step": 1120
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.21247586240459654,
      "learning_rate": 1.463024063062827e-05,
      "loss": 0.5855,
      "step": 1125
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.22045045025828708,
      "learning_rate": 1.457348100451146e-05,
      "loss": 0.5883,
      "step": 1130
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.214769725773024,
      "learning_rate": 1.4516534450187126e-05,
      "loss": 0.5877,
      "step": 1135
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.2160947649465179,
      "learning_rate": 1.4459403295185933e-05,
      "loss": 0.5951,
      "step": 1140
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.21080831461558835,
      "learning_rate": 1.4402089874583594e-05,
      "loss": 0.5616,
      "step": 1145
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.3517838242091068,
      "learning_rate": 1.4344596530905412e-05,
      "loss": 0.5981,
      "step": 1150
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.2170223068256742,
      "learning_rate": 1.4286925614030542e-05,
      "loss": 0.5962,
      "step": 1155
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.20876572174386707,
      "learning_rate": 1.4229079481095949e-05,
      "loss": 0.5705,
      "step": 1160
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.2114530881165322,
      "learning_rate": 1.4171060496400055e-05,
      "loss": 0.5831,
      "step": 1165
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.2124316227899679,
      "learning_rate": 1.4112871031306118e-05,
      "loss": 0.5927,
      "step": 1170
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.24069796832607096,
      "learning_rate": 1.4054513464145303e-05,
      "loss": 0.5843,
      "step": 1175
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.21761542079259927,
      "learning_rate": 1.3995990180119478e-05,
      "loss": 0.5913,
      "step": 1180
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.22376197027518,
      "learning_rate": 1.3937303571203718e-05,
      "loss": 0.5937,
      "step": 1185
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.21270100949939988,
      "learning_rate": 1.387845603604855e-05,
      "loss": 0.6087,
      "step": 1190
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.2061356457768136,
      "learning_rate": 1.3819449979881907e-05,
      "loss": 0.5913,
      "step": 1195
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.21386723380022615,
      "learning_rate": 1.3760287814410822e-05,
      "loss": 0.5993,
      "step": 1200
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.21228068552627186,
      "learning_rate": 1.3700971957722861e-05,
      "loss": 0.5957,
      "step": 1205
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.22337829074807855,
      "learning_rate": 1.3641504834187288e-05,
      "loss": 0.5877,
      "step": 1210
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.20603083898424746,
      "learning_rate": 1.3581888874355969e-05,
      "loss": 0.5925,
      "step": 1215
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.21192778054346198,
      "learning_rate": 1.3522126514864047e-05,
      "loss": 0.5891,
      "step": 1220
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.2084884476447279,
      "learning_rate": 1.346222019833033e-05,
      "loss": 0.5834,
      "step": 1225
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.2147834275155995,
      "learning_rate": 1.3402172373257466e-05,
      "loss": 0.5699,
      "step": 1230
    },
    {
      "epoch": 1.36,
      "grad_norm": 0.2098959357243527,
      "learning_rate": 1.3341985493931877e-05,
      "loss": 0.5962,
      "step": 1235
    },
    {
      "epoch": 1.36,
      "grad_norm": 0.21524463993944057,
      "learning_rate": 1.3281662020323434e-05,
      "loss": 0.5732,
      "step": 1240
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.21867167573117696,
      "learning_rate": 1.3221204417984907e-05,
      "loss": 0.5955,
      "step": 1245
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.21413713721132682,
      "learning_rate": 1.3160615157951218e-05,
      "loss": 0.6075,
      "step": 1250
    },
    {
      "epoch": 1.38,
      "grad_norm": 0.21839276156580137,
      "learning_rate": 1.3099896716638414e-05,
      "loss": 0.6037,
      "step": 1255
    },
    {
      "epoch": 1.38,
      "grad_norm": 0.2083037589975683,
      "learning_rate": 1.303905157574247e-05,
      "loss": 0.5824,
      "step": 1260
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.2113647393844558,
      "learning_rate": 1.297808222213785e-05,
      "loss": 0.583,
      "step": 1265
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.21208258707761682,
      "learning_rate": 1.2916991147775867e-05,
      "loss": 0.5968,
      "step": 1270
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.21585097977939674,
      "learning_rate": 1.2855780849582828e-05,
      "loss": 0.605,
      "step": 1275
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.2177190382165568,
      "learning_rate": 1.2794453829357974e-05,
      "loss": 0.5917,
      "step": 1280
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.20918133994979798,
      "learning_rate": 1.2733012593671235e-05,
      "loss": 0.5808,
      "step": 1285
    },
    {
      "epoch": 1.42,
      "grad_norm": 0.20727620915430647,
      "learning_rate": 1.2671459653760781e-05,
      "loss": 0.5848,
      "step": 1290
    },
    {
      "epoch": 1.42,
      "grad_norm": 0.21127807703832494,
      "learning_rate": 1.2609797525430374e-05,
      "loss": 0.5918,
      "step": 1295
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.22995739968919704,
      "learning_rate": 1.2548028728946548e-05,
      "loss": 0.5788,
      "step": 1300
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.283656419786389,
      "learning_rate": 1.2486155788935599e-05,
      "loss": 0.5696,
      "step": 1305
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.2060037733197605,
      "learning_rate": 1.24241812342804e-05,
      "loss": 0.5679,
      "step": 1310
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.20635796088681332,
      "learning_rate": 1.2362107598017037e-05,
      "loss": 0.5724,
      "step": 1315
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.22467393881977182,
      "learning_rate": 1.2299937417231269e-05,
      "loss": 0.5955,
      "step": 1320
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.2074247154565041,
      "learning_rate": 1.2237673232954854e-05,
      "loss": 0.5971,
      "step": 1325
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.2424465336794506,
      "learning_rate": 1.2175317590061676e-05,
      "loss": 0.5781,
      "step": 1330
    },
    {
      "epoch": 1.47,
      "grad_norm": 0.22257500583547102,
      "learning_rate": 1.2112873037163728e-05,
      "loss": 0.5839,
      "step": 1335
    },
    {
      "epoch": 1.47,
      "grad_norm": 0.2066344256103609,
      "learning_rate": 1.2050342126506958e-05,
      "loss": 0.5739,
      "step": 1340
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.22883829030621783,
      "learning_rate": 1.1987727413866936e-05,
      "loss": 0.5834,
      "step": 1345
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.2078381748570218,
      "learning_rate": 1.1925031458444416e-05,
      "loss": 0.5987,
      "step": 1350
    },
    {
      "epoch": 1.49,
      "grad_norm": 0.19775167797301507,
      "learning_rate": 1.1862256822760704e-05,
      "loss": 0.6014,
      "step": 1355
    },
    {
      "epoch": 1.49,
      "grad_norm": 0.22122261973414395,
      "learning_rate": 1.1799406072552963e-05,
      "loss": 0.6051,
      "step": 1360
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.2048271696065503,
      "learning_rate": 1.1736481776669307e-05,
      "loss": 0.5836,
      "step": 1365
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.2089570875625985,
      "learning_rate": 1.1673486506963824e-05,
      "loss": 0.5969,
      "step": 1370
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.20686459307988367,
      "learning_rate": 1.1610422838191473e-05,
      "loss": 0.5838,
      "step": 1375
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.2011184871623206,
      "learning_rate": 1.1547293347902813e-05,
      "loss": 0.5809,
      "step": 1380
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.20796307453083696,
      "learning_rate": 1.148410061633869e-05,
      "loss": 0.577,
      "step": 1385
    },
    {
      "epoch": 1.53,
      "grad_norm": 0.21285626663848575,
      "learning_rate": 1.1420847226324746e-05,
      "loss": 0.5842,
      "step": 1390
    },
    {
      "epoch": 1.53,
      "grad_norm": 0.21073449630208443,
      "learning_rate": 1.135753576316588e-05,
      "loss": 0.5631,
      "step": 1395
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.21419628737948346,
      "learning_rate": 1.1294168814540554e-05,
      "loss": 0.5725,
      "step": 1400
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.21145643817601928,
      "learning_rate": 1.1230748970395056e-05,
      "loss": 0.5841,
      "step": 1405
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.21985931289420635,
      "learning_rate": 1.1167278822837621e-05,
      "loss": 0.5921,
      "step": 1410
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.21769355752757327,
      "learning_rate": 1.1103760966032497e-05,
      "loss": 0.5969,
      "step": 1415
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.205702651788512,
      "learning_rate": 1.1040197996093915e-05,
      "loss": 0.5998,
      "step": 1420
    },
    {
      "epoch": 1.57,
      "grad_norm": 0.2123850240700951,
      "learning_rate": 1.0976592510979982e-05,
      "loss": 0.602,
      "step": 1425
    },
    {
      "epoch": 1.57,
      "grad_norm": 0.2080343816532754,
      "learning_rate": 1.0912947110386484e-05,
      "loss": 0.5779,
      "step": 1430
    },
    {
      "epoch": 1.58,
      "grad_norm": 0.24850849809790562,
      "learning_rate": 1.084926439564065e-05,
      "loss": 0.6032,
      "step": 1435
    },
    {
      "epoch": 1.58,
      "grad_norm": 0.21564897773144823,
      "learning_rate": 1.0785546969594813e-05,
      "loss": 0.5869,
      "step": 1440
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.20762144286700152,
      "learning_rate": 1.0721797436520044e-05,
      "loss": 0.5826,
      "step": 1445
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.2063081386274062,
      "learning_rate": 1.0658018401999681e-05,
      "loss": 0.5704,
      "step": 1450
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.20663776641786638,
      "learning_rate": 1.0594212472822865e-05,
      "loss": 0.5699,
      "step": 1455
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.2041445625688445,
      "learning_rate": 1.053038225687798e-05,
      "loss": 0.5844,
      "step": 1460
    },
    {
      "epoch": 1.61,
      "grad_norm": 0.2130137089909296,
      "learning_rate": 1.0466530363046057e-05,
      "loss": 0.5863,
      "step": 1465
    },
    {
      "epoch": 1.62,
      "grad_norm": 0.20861757470403922,
      "learning_rate": 1.0402659401094154e-05,
      "loss": 0.5967,
      "step": 1470
    },
    {
      "epoch": 1.62,
      "grad_norm": 0.2027441783860282,
      "learning_rate": 1.033877198156868e-05,
      "loss": 0.5897,
      "step": 1475
    },
    {
      "epoch": 1.63,
      "grad_norm": 0.213846779001926,
      "learning_rate": 1.0274870715688713e-05,
      "loss": 0.6025,
      "step": 1480
    },
    {
      "epoch": 1.63,
      "grad_norm": 0.2127779802918096,
      "learning_rate": 1.0210958215239249e-05,
      "loss": 0.6056,
      "step": 1485
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.20745193969187556,
      "learning_rate": 1.0147037092464469e-05,
      "loss": 0.577,
      "step": 1490
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.21182819095935948,
      "learning_rate": 1.0083109959960974e-05,
      "loss": 0.5864,
      "step": 1495
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.19696271189726422,
      "learning_rate": 1.0019179430570984e-05,
      "loss": 0.5928,
      "step": 1500
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.20879291422466129,
      "learning_rate": 9.955248117275566e-06,
      "loss": 0.5759,
      "step": 1505
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.20354233979841574,
      "learning_rate": 9.891318633087831e-06,
      "loss": 0.5752,
      "step": 1510
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.20873959818273613,
      "learning_rate": 9.827393590946116e-06,
      "loss": 0.5781,
      "step": 1515
    },
    {
      "epoch": 1.67,
      "grad_norm": 0.20270163619729287,
      "learning_rate": 9.763475603607215e-06,
      "loss": 0.5766,
      "step": 1520
    },
    {
      "epoch": 1.68,
      "grad_norm": 0.20654005706496942,
      "learning_rate": 9.699567283539567e-06,
      "loss": 0.5681,
      "step": 1525
    },
    {
      "epoch": 1.68,
      "grad_norm": 0.20618557107103305,
      "learning_rate": 9.635671242816503e-06,
      "loss": 0.609,
      "step": 1530
    },
    {
      "epoch": 1.69,
      "grad_norm": 0.21061722996416557,
      "learning_rate": 9.571790093009445e-06,
      "loss": 0.5934,
      "step": 1535
    },
    {
      "epoch": 1.69,
      "grad_norm": 0.21093457717343486,
      "learning_rate": 9.50792644508122e-06,
      "loss": 0.581,
      "step": 1540
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.21238370099560738,
      "learning_rate": 9.44408290927929e-06,
      "loss": 0.5739,
      "step": 1545
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.2164905762404212,
      "learning_rate": 9.380262095029113e-06,
      "loss": 0.5927,
      "step": 1550
    },
    {
      "epoch": 1.71,
      "grad_norm": 0.20454952313454147,
      "learning_rate": 9.316466610827446e-06,
      "loss": 0.5873,
      "step": 1555
    },
    {
      "epoch": 1.71,
      "grad_norm": 0.20592385724986123,
      "learning_rate": 9.252699064135759e-06,
      "loss": 0.5671,
      "step": 1560
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.2033785032533706,
      "learning_rate": 9.188962061273664e-06,
      "loss": 0.5658,
      "step": 1565
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.21054461531122468,
      "learning_rate": 9.125258207312365e-06,
      "loss": 0.5792,
      "step": 1570
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.21120511900125027,
      "learning_rate": 9.061590105968208e-06,
      "loss": 0.5854,
      "step": 1575
    },
    {
      "epoch": 1.74,
      "grad_norm": 0.20830877482379653,
      "learning_rate": 8.997960359496248e-06,
      "loss": 0.5826,
      "step": 1580
    },
    {
      "epoch": 1.74,
      "grad_norm": 0.20288306787814717,
      "learning_rate": 8.934371568583893e-06,
      "loss": 0.5706,
      "step": 1585
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.21694924382055683,
      "learning_rate": 8.8708263322446e-06,
      "loss": 0.5951,
      "step": 1590
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.20903099761718094,
      "learning_rate": 8.807327247711667e-06,
      "loss": 0.5824,
      "step": 1595
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.20063759680689586,
      "learning_rate": 8.743876910332057e-06,
      "loss": 0.5614,
      "step": 1600
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.20409842230229638,
      "learning_rate": 8.680477913460339e-06,
      "loss": 0.5946,
      "step": 1605
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.20466612084153943,
      "learning_rate": 8.617132848352672e-06,
      "loss": 0.5652,
      "step": 1610
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.20697633678427196,
      "learning_rate": 8.553844304060908e-06,
      "loss": 0.5812,
      "step": 1615
    },
    {
      "epoch": 1.78,
      "grad_norm": 0.21720588749316946,
      "learning_rate": 8.490614867326775e-06,
      "loss": 0.6117,
      "step": 1620
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.20226043800686028,
      "learning_rate": 8.427447122476148e-06,
      "loss": 0.5813,
      "step": 1625
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.20240033305212438,
      "learning_rate": 8.364343651313406e-06,
      "loss": 0.5838,
      "step": 1630
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.20457722466303066,
      "learning_rate": 8.301307033015928e-06,
      "loss": 0.5838,
      "step": 1635
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.21113002739789974,
      "learning_rate": 8.23833984402868e-06,
      "loss": 0.5642,
      "step": 1640
    },
    {
      "epoch": 1.81,
      "grad_norm": 0.20884646327430137,
      "learning_rate": 8.175444657958875e-06,
      "loss": 0.5887,
      "step": 1645
    },
    {
      "epoch": 1.81,
      "grad_norm": 0.2117374486270568,
      "learning_rate": 8.112624045470834e-06,
      "loss": 0.5586,
      "step": 1650
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.2153924095751537,
      "learning_rate": 8.04988057418088e-06,
      "loss": 0.5586,
      "step": 1655
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.2242880757841404,
      "learning_rate": 7.987216808552409e-06,
      "loss": 0.584,
      "step": 1660
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.2052097826707033,
      "learning_rate": 7.924635309791065e-06,
      "loss": 0.5734,
      "step": 1665
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.20970207818516165,
      "learning_rate": 7.862138635740078e-06,
      "loss": 0.5794,
      "step": 1670
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.22854744935308324,
      "learning_rate": 7.799729340775688e-06,
      "loss": 0.5665,
      "step": 1675
    },
    {
      "epoch": 1.85,
      "grad_norm": 0.21044359070616686,
      "learning_rate": 7.73740997570278e-06,
      "loss": 0.5798,
      "step": 1680
    },
    {
      "epoch": 1.85,
      "grad_norm": 0.22708073388787278,
      "learning_rate": 7.675183087650592e-06,
      "loss": 0.5801,
      "step": 1685
    },
    {
      "epoch": 1.86,
      "grad_norm": 0.2033282205763091,
      "learning_rate": 7.613051219968624e-06,
      "loss": 0.5839,
      "step": 1690
    },
    {
      "epoch": 1.86,
      "grad_norm": 0.20524876955037163,
      "learning_rate": 7.551016912122692e-06,
      "loss": 0.5669,
      "step": 1695
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.2084171272109649,
      "learning_rate": 7.489082699591128e-06,
      "loss": 0.5772,
      "step": 1700
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.2045659246543415,
      "learning_rate": 7.4272511137611405e-06,
      "loss": 0.5888,
      "step": 1705
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.2238357346707814,
      "learning_rate": 7.3655246818253626e-06,
      "loss": 0.5778,
      "step": 1710
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.20675904524387453,
      "learning_rate": 7.303905926678565e-06,
      "loss": 0.5716,
      "step": 1715
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.19722673110072383,
      "learning_rate": 7.242397366814516e-06,
      "loss": 0.5807,
      "step": 1720
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.2048782845337188,
      "learning_rate": 7.181001516223074e-06,
      "loss": 0.5826,
      "step": 1725
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.20915644349360044,
      "learning_rate": 7.1197208842874175e-06,
      "loss": 0.5604,
      "step": 1730
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.20273160113226266,
      "learning_rate": 7.058557975681488e-06,
      "loss": 0.5333,
      "step": 1735
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.20926689973709214,
      "learning_rate": 6.997515290267611e-06,
      "loss": 0.5946,
      "step": 1740
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.21243148030222134,
      "learning_rate": 6.936595322994328e-06,
      "loss": 0.5704,
      "step": 1745
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.2322671841870559,
      "learning_rate": 6.8758005637944245e-06,
      "loss": 0.5672,
      "step": 1750
    },
    {
      "epoch": 1.93,
      "grad_norm": 0.20191574871457757,
      "learning_rate": 6.815133497483157e-06,
      "loss": 0.5531,
      "step": 1755
    },
    {
      "epoch": 1.93,
      "grad_norm": 0.1979263709848421,
      "learning_rate": 6.754596603656687e-06,
      "loss": 0.5855,
      "step": 1760
    },
    {
      "epoch": 1.94,
      "grad_norm": 0.20297785849664604,
      "learning_rate": 6.694192356590743e-06,
      "loss": 0.5611,
      "step": 1765
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.2057255315804723,
      "learning_rate": 6.633923225139498e-06,
      "loss": 0.5603,
      "step": 1770
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.2039103225151876,
      "learning_rate": 6.573791672634638e-06,
      "loss": 0.564,
      "step": 1775
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.20729859585281207,
      "learning_rate": 6.513800156784709e-06,
      "loss": 0.5665,
      "step": 1780
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.21039586635611288,
      "learning_rate": 6.453951129574644e-06,
      "loss": 0.5731,
      "step": 1785
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.2517268650469455,
      "learning_rate": 6.394247037165559e-06,
      "loss": 0.5895,
      "step": 1790
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.20420292346101634,
      "learning_rate": 6.3346903197947564e-06,
      "loss": 0.5822,
      "step": 1795
    },
    {
      "epoch": 1.98,
      "grad_norm": 0.2005713051020868,
      "learning_rate": 6.275283411676008e-06,
      "loss": 0.5747,
      "step": 1800
    },
    {
      "epoch": 1.98,
      "grad_norm": 0.20383448396361142,
      "learning_rate": 6.216028740900042e-06,
      "loss": 0.564,
      "step": 1805
    },
    {
      "epoch": 1.99,
      "grad_norm": 0.20518916264594467,
      "learning_rate": 6.1569287293353274e-06,
      "loss": 0.569,
      "step": 1810
    },
    {
      "epoch": 1.99,
      "grad_norm": 0.20991517111604094,
      "learning_rate": 6.097985792529055e-06,
      "loss": 0.5776,
      "step": 1815
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.20901449217066023,
      "learning_rate": 6.039202339608432e-06,
      "loss": 0.5701,
      "step": 1820
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6639273166656494,
      "eval_runtime": 13.2474,
      "eval_samples_per_second": 104.021,
      "eval_steps_per_second": 0.83,
      "step": 1820
    },
    {
      "epoch": 2.01,
      "grad_norm": 0.22019281423746326,
      "learning_rate": 5.980580773182214e-06,
      "loss": 0.522,
      "step": 1825
    },
    {
      "epoch": 2.01,
      "grad_norm": 0.20628500931618077,
      "learning_rate": 5.922123489242499e-06,
      "loss": 0.5217,
      "step": 1830
    },
    {
      "epoch": 2.02,
      "grad_norm": 0.21214052920599313,
      "learning_rate": 5.8638328770667905e-06,
      "loss": 0.5158,
      "step": 1835
    },
    {
      "epoch": 2.02,
      "grad_norm": 0.2125306506706992,
      "learning_rate": 5.805711319120358e-06,
      "loss": 0.5266,
      "step": 1840
    },
    {
      "epoch": 2.03,
      "grad_norm": 0.20497451303661474,
      "learning_rate": 5.747761190958859e-06,
      "loss": 0.5379,
      "step": 1845
    },
    {
      "epoch": 2.03,
      "grad_norm": 0.20908534530239564,
      "learning_rate": 5.689984861131221e-06,
      "loss": 0.5235,
      "step": 1850
    },
    {
      "epoch": 2.04,
      "grad_norm": 0.21454631325571177,
      "learning_rate": 5.632384691082874e-06,
      "loss": 0.559,
      "step": 1855
    },
    {
      "epoch": 2.04,
      "grad_norm": 0.20512681342425096,
      "learning_rate": 5.5749630350592e-06,
      "loss": 0.5112,
      "step": 1860
    },
    {
      "epoch": 2.05,
      "grad_norm": 0.21863556347579605,
      "learning_rate": 5.517722240009319e-06,
      "loss": 0.5252,
      "step": 1865
    },
    {
      "epoch": 2.05,
      "grad_norm": 0.2042884660329648,
      "learning_rate": 5.460664645490172e-06,
      "loss": 0.5319,
      "step": 1870
    },
    {
      "epoch": 2.06,
      "grad_norm": 0.20585363297531442,
      "learning_rate": 5.403792583570884e-06,
      "loss": 0.5486,
      "step": 1875
    },
    {
      "epoch": 2.07,
      "grad_norm": 0.21017174229067934,
      "learning_rate": 5.347108378737469e-06,
      "loss": 0.5219,
      "step": 1880
    },
    {
      "epoch": 2.07,
      "grad_norm": 0.20323326084962492,
      "learning_rate": 5.290614347797802e-06,
      "loss": 0.5304,
      "step": 1885
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.20641023175360712,
      "learning_rate": 5.234312799786921e-06,
      "loss": 0.5078,
      "step": 1890
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.20699085777952853,
      "learning_rate": 5.1782060358726885e-06,
      "loss": 0.541,
      "step": 1895
    },
    {
      "epoch": 2.09,
      "grad_norm": 0.21849299793363283,
      "learning_rate": 5.122296349261695e-06,
      "loss": 0.5382,
      "step": 1900
    },
    {
      "epoch": 2.09,
      "grad_norm": 0.2225675454375723,
      "learning_rate": 5.066586025105558e-06,
      "loss": 0.5222,
      "step": 1905
    },
    {
      "epoch": 2.1,
      "grad_norm": 0.21820736824894094,
      "learning_rate": 5.011077340407509e-06,
      "loss": 0.521,
      "step": 1910
    },
    {
      "epoch": 2.1,
      "grad_norm": 0.21040780570604573,
      "learning_rate": 4.955772563929334e-06,
      "loss": 0.5138,
      "step": 1915
    },
    {
      "epoch": 2.11,
      "grad_norm": 0.20328121606226746,
      "learning_rate": 4.900673956098644e-06,
      "loss": 0.5,
      "step": 1920
    },
    {
      "epoch": 2.12,
      "grad_norm": 0.20307162400180162,
      "learning_rate": 4.845783768916482e-06,
      "loss": 0.5205,
      "step": 1925
    },
    {
      "epoch": 2.12,
      "grad_norm": 0.2145754172951753,
      "learning_rate": 4.79110424586528e-06,
      "loss": 0.5361,
      "step": 1930
    },
    {
      "epoch": 2.13,
      "grad_norm": 0.22278545561346286,
      "learning_rate": 4.736637621817176e-06,
      "loss": 0.5415,
      "step": 1935
    },
    {
      "epoch": 2.13,
      "grad_norm": 0.20777015428265447,
      "learning_rate": 4.682386122942649e-06,
      "loss": 0.5466,
      "step": 1940
    },
    {
      "epoch": 2.14,
      "grad_norm": 0.20476960423981388,
      "learning_rate": 4.628351966619531e-06,
      "loss": 0.5356,
      "step": 1945
    },
    {
      "epoch": 2.14,
      "grad_norm": 0.2026935858356193,
      "learning_rate": 4.5745373613424075e-06,
      "loss": 0.5183,
      "step": 1950
    },
    {
      "epoch": 2.15,
      "grad_norm": 0.20396283481465377,
      "learning_rate": 4.520944506632314e-06,
      "loss": 0.5282,
      "step": 1955
    },
    {
      "epoch": 2.15,
      "grad_norm": 0.20723703288940273,
      "learning_rate": 4.467575592946865e-06,
      "loss": 0.5353,
      "step": 1960
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.20283509715409914,
      "learning_rate": 4.414432801590703e-06,
      "loss": 0.5253,
      "step": 1965
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.2052800409715183,
      "learning_rate": 4.361518304626366e-06,
      "loss": 0.5144,
      "step": 1970
    },
    {
      "epoch": 2.17,
      "grad_norm": 0.19650849856593194,
      "learning_rate": 4.308834264785483e-06,
      "loss": 0.515,
      "step": 1975
    },
    {
      "epoch": 2.18,
      "grad_norm": 0.20863894389929338,
      "learning_rate": 4.256382835380421e-06,
      "loss": 0.5202,
      "step": 1980
    },
    {
      "epoch": 2.18,
      "grad_norm": 0.20698871916210282,
      "learning_rate": 4.204166160216216e-06,
      "loss": 0.5341,
      "step": 1985
    },
    {
|
"epoch": 2.19, |
|
"grad_norm": 0.21288394644934694, |
|
"learning_rate": 4.1521863735030065e-06, |
|
"loss": 0.5428, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.21106167476818527, |
|
"learning_rate": 4.100445599768774e-06, |
|
"loss": 0.522, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.20964196528234977, |
|
"learning_rate": 4.048945953772504e-06, |
|
"loss": 0.5294, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.2186729758235947, |
|
"learning_rate": 3.99768954041778e-06, |
|
"loss": 0.5258, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.21074477316001103, |
|
"learning_rate": 3.946678454666719e-06, |
|
"loss": 0.5296, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.2082023730818782, |
|
"learning_rate": 3.89591478145437e-06, |
|
"loss": 0.5232, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.211665046662059, |
|
"learning_rate": 3.845400595603482e-06, |
|
"loss": 0.5325, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.20452212904152248, |
|
"learning_rate": 3.79513796173971e-06, |
|
"loss": 0.5208, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.20097361934014168, |
|
"learning_rate": 3.745128934207225e-06, |
|
"loss": 0.5154, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.20835101950643772, |
|
"learning_rate": 3.695375556984764e-06, |
|
"loss": 0.5221, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.20751752347710595, |
|
"learning_rate": 3.6458798636020477e-06, |
|
"loss": 0.5208, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.2083029476627408, |
|
"learning_rate": 3.59664387705672e-06, |
|
"loss": 0.524, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.20547050805474043, |
|
"learning_rate": 3.5476696097316253e-06, |
|
"loss": 0.5224, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.20721249112618484, |
|
"learning_rate": 3.4989590633125583e-06, |
|
"loss": 0.5335, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.2084694829774017, |
|
"learning_rate": 3.450514228706482e-06, |
|
"loss": 0.5229, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.20216282130770613, |
|
"learning_rate": 3.4023370859601192e-06, |
|
"loss": 0.5375, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.2036489936294621, |
|
"learning_rate": 3.3544296041790457e-06, |
|
"loss": 0.5123, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.20447914104651402, |
|
"learning_rate": 3.3067937414471986e-06, |
|
"loss": 0.5283, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.20966243005652113, |
|
"learning_rate": 3.2594314447468457e-06, |
|
"loss": 0.5207, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.20579910881393176, |
|
"learning_rate": 3.2123446498790214e-06, |
|
"loss": 0.518, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.19982035514392868, |
|
"learning_rate": 3.1655352813843886e-06, |
|
"loss": 0.53, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.2114271078042674, |
|
"learning_rate": 3.1190052524645752e-06, |
|
"loss": 0.5297, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.20832244818822046, |
|
"learning_rate": 3.0727564649040066e-06, |
|
"loss": 0.5102, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.2008612540571781, |
|
"learning_rate": 3.0267908089921438e-06, |
|
"loss": 0.525, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.21335420821701767, |
|
"learning_rate": 2.9811101634462414e-06, |
|
"loss": 0.5161, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.20450727458510107, |
|
"learning_rate": 2.93571639533455e-06, |
|
"loss": 0.5139, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.213686615068082, |
|
"learning_rate": 2.8906113600000153e-06, |
|
"loss": 0.5203, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.20772690016178588, |
|
"learning_rate": 2.8457969009844354e-06, |
|
"loss": 0.5437, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.20632381814936407, |
|
"learning_rate": 2.8012748499531195e-06, |
|
"loss": 0.5318, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.2016664102243559, |
|
"learning_rate": 2.7570470266200177e-06, |
|
"loss": 0.5165, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.20520163431623825, |
|
"learning_rate": 2.713115238673356e-06, |
|
"loss": 0.5193, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.214071163971577, |
|
"learning_rate": 2.669481281701739e-06, |
|
"loss": 0.5268, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.20652575043829738, |
|
"learning_rate": 2.626146939120757e-06, |
|
"loss": 0.5176, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.20623703636177199, |
|
"learning_rate": 2.5831139821001184e-06, |
|
"loss": 0.5194, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.2073488736443259, |
|
"learning_rate": 2.5403841694912333e-06, |
|
"loss": 0.521, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.20428316103740377, |
|
"learning_rate": 2.497959247755335e-06, |
|
"loss": 0.522, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.20624815834176138, |
|
"learning_rate": 2.455840950892099e-06, |
|
"loss": 0.5338, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.2107922125349263, |
|
"learning_rate": 2.414031000368767e-06, |
|
"loss": 0.5381, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.20453939515184638, |
|
"learning_rate": 2.372531105049789e-06, |
|
"loss": 0.5183, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.19793223071462618, |
|
"learning_rate": 2.331342961126988e-06, |
|
"loss": 0.5113, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.21378964248804647, |
|
"learning_rate": 2.290468252050204e-06, |
|
"loss": 0.5296, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.22063310823848487, |
|
"learning_rate": 2.2499086484585255e-06, |
|
"loss": 0.5169, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.20381232643959812, |
|
"learning_rate": 2.2096658081119793e-06, |
|
"loss": 0.5196, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.21044163878960068, |
|
"learning_rate": 2.1697413758237785e-06, |
|
"loss": 0.5373, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.20577762743347522, |
|
"learning_rate": 2.130136983393112e-06, |
|
"loss": 0.5095, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.20351905534806264, |
|
"learning_rate": 2.0908542495384276e-06, |
|
"loss": 0.5238, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.19865856316493957, |
|
"learning_rate": 2.051894779831286e-06, |
|
"loss": 0.516, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.19763372174700705, |
|
"learning_rate": 2.0132601666307295e-06, |
|
"loss": 0.5073, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.19981546411123563, |
|
"learning_rate": 1.9749519890182035e-06, |
|
"loss": 0.5233, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.2033638338024947, |
|
"learning_rate": 1.936971812733012e-06, |
|
"loss": 0.5023, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.20822189802082794, |
|
"learning_rate": 1.8993211901083353e-06, |
|
"loss": 0.5009, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.2079466895136193, |
|
"learning_rate": 1.8620016600077516e-06, |
|
"loss": 0.5256, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.20439899215227353, |
|
"learning_rate": 1.8250147477623836e-06, |
|
"loss": 0.5365, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.22075705014027322, |
|
"learning_rate": 1.7883619651085194e-06, |
|
"loss": 0.5168, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.1964170868119619, |
|
"learning_rate": 1.7520448101258325e-06, |
|
"loss": 0.5173, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.2018340951265649, |
|
"learning_rate": 1.716064767176172e-06, |
|
"loss": 0.5208, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.20205110525587222, |
|
"learning_rate": 1.6804233068428678e-06, |
|
"loss": 0.5321, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.21357970053476708, |
|
"learning_rate": 1.6451218858706374e-06, |
|
"loss": 0.5203, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.2060738726003442, |
|
"learning_rate": 1.6101619471060415e-06, |
|
"loss": 0.5206, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.20422451049362417, |
|
"learning_rate": 1.5755449194385164e-06, |
|
"loss": 0.5183, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.2002944726641853, |
|
"learning_rate": 1.5412722177419658e-06, |
|
"loss": 0.5315, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.20080246126740545, |
|
"learning_rate": 1.5073452428169444e-06, |
|
"loss": 0.5302, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.20361752591083399, |
|
"learning_rate": 1.4737653813333774e-06, |
|
"loss": 0.5178, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.19999788848075653, |
|
"learning_rate": 1.4405340057739203e-06, |
|
"loss": 0.5192, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.1959282126715599, |
|
"learning_rate": 1.407652474377832e-06, |
|
"loss": 0.5156, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.20774381542552106, |
|
"learning_rate": 1.3751221310854778e-06, |
|
"loss": 0.5056, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.2038197815633597, |
|
"learning_rate": 1.3429443054833913e-06, |
|
"loss": 0.5265, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.20325668884167009, |
|
"learning_rate": 1.311120312749935e-06, |
|
"loss": 0.5234, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.20390134033380405, |
|
"learning_rate": 1.2796514536015492e-06, |
|
"loss": 0.5368, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.20166563167725796, |
|
"learning_rate": 1.2485390142395793e-06, |
|
"loss": 0.5205, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.2022982713597928, |
|
"learning_rate": 1.2177842662977136e-06, |
|
"loss": 0.5231, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.21218295817596197, |
|
"learning_rate": 1.1873884667900125e-06, |
|
"loss": 0.5439, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.20473864934409344, |
|
"learning_rate": 1.1573528580595195e-06, |
|
"loss": 0.521, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.20949092154655793, |
|
"learning_rate": 1.1276786677274866e-06, |
|
"loss": 0.5105, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.20094255278913592, |
|
"learning_rate": 1.0983671086432146e-06, |
|
"loss": 0.5128, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.2057662784043456, |
|
"learning_rate": 1.069419378834461e-06, |
|
"loss": 0.5385, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.20176044390537098, |
|
"learning_rate": 1.040836661458482e-06, |
|
"loss": 0.5171, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.21932275472178112, |
|
"learning_rate": 1.0126201247536783e-06, |
|
"loss": 0.528, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.19746784289323338, |
|
"learning_rate": 9.8477092199184e-07, |
|
"loss": 0.511, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.20034469208109756, |
|
"learning_rate": 9.57290191431013e-07, |
|
"loss": 0.5227, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.20104497306207902, |
|
"learning_rate": 9.301790562689794e-07, |
|
"loss": 0.5277, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.1972303787668993, |
|
"learning_rate": 9.034386245973359e-07, |
|
"loss": 0.5147, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.20641734055273483, |
|
"learning_rate": 8.770699893562273e-07, |
|
"loss": 0.5256, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.2083707616895028, |
|
"learning_rate": 8.510742282896545e-07, |
|
"loss": 0.5154, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.20003452385731113, |
|
"learning_rate": 8.254524039014289e-07, |
|
"loss": 0.5333, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.20109885864318555, |
|
"learning_rate": 8.002055634117578e-07, |
|
"loss": 0.5088, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.20645302721542735, |
|
"learning_rate": 7.753347387144294e-07, |
|
"loss": 0.5126, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.20323590104576408, |
|
"learning_rate": 7.508409463346389e-07, |
|
"loss": 0.5234, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.20483041641048474, |
|
"learning_rate": 7.26725187387446e-07, |
|
"loss": 0.5207, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.20363185231425762, |
|
"learning_rate": 7.029884475368542e-07, |
|
"loss": 0.5234, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.19585624352673048, |
|
"learning_rate": 6.796316969555205e-07, |
|
"loss": 0.516, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.21163098285472737, |
|
"learning_rate": 6.566558902851161e-07, |
|
"loss": 0.5382, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.2061275767234072, |
|
"learning_rate": 6.340619665972847e-07, |
|
"loss": 0.5252, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.2050927030534903, |
|
"learning_rate": 6.118508493552866e-07, |
|
"loss": 0.5081, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.20411513952463778, |
|
"learning_rate": 5.900234463762367e-07, |
|
"loss": 0.5248, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.20399794959320638, |
|
"learning_rate": 5.685806497940027e-07, |
|
"loss": 0.5163, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.20891752353282228, |
|
"learning_rate": 5.475233360227516e-07, |
|
"loss": 0.529, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.2005044795674912, |
|
"learning_rate": 5.268523657211188e-07, |
|
"loss": 0.5009, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.2046930603550785, |
|
"learning_rate": 5.065685837570312e-07, |
|
"loss": 0.5165, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.20175610856641593, |
|
"learning_rate": 4.866728191731829e-07, |
|
"loss": 0.5182, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.2030309299250094, |
|
"learning_rate": 4.671658851531424e-07, |
|
"loss": 0.5199, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.1914622288737903, |
|
"learning_rate": 4.480485789881217e-07, |
|
"loss": 0.5177, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.19668048668936855, |
|
"learning_rate": 4.293216820443891e-07, |
|
"loss": 0.5308, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.22039481521735169, |
|
"learning_rate": 4.109859597313237e-07, |
|
"loss": 0.5283, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.20757846010886538, |
|
"learning_rate": 3.9304216147014853e-07, |
|
"loss": 0.5347, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.20740377627342962, |
|
"learning_rate": 3.7549102066328226e-07, |
|
"loss": 0.5307, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.2100472925176084, |
|
"learning_rate": 3.5833325466437697e-07, |
|
"loss": 0.5223, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.20177360779386036, |
|
"learning_rate": 3.4156956474898805e-07, |
|
"loss": 0.5255, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.20707380368033076, |
|
"learning_rate": 3.2520063608592165e-07, |
|
"loss": 0.5122, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.19965994990493274, |
|
"learning_rate": 3.0922713770922155e-07, |
|
"loss": 0.5214, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.20324516199000714, |
|
"learning_rate": 2.9364972249082747e-07, |
|
"loss": 0.5145, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.20150097372860662, |
|
"learning_rate": 2.7846902711389236e-07, |
|
"loss": 0.5132, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.2051450132081192, |
|
"learning_rate": 2.636856720467573e-07, |
|
"loss": 0.5128, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.20266490344541718, |
|
"learning_rate": 2.493002615175977e-07, |
|
"loss": 0.5122, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.20236603858533658, |
|
"learning_rate": 2.3531338348971366e-07, |
|
"loss": 0.5327, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.20530608079603682, |
|
"learning_rate": 2.217256096375131e-07, |
|
"loss": 0.5099, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.2000522109375465, |
|
"learning_rate": 2.0853749532314006e-07, |
|
"loss": 0.5299, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.20808661956102248, |
|
"learning_rate": 1.9574957957377294e-07, |
|
"loss": 0.5225, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.2026575467689292, |
|
"learning_rate": 1.8336238505959892e-07, |
|
"loss": 0.5286, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.2043520049010144, |
|
"learning_rate": 1.7137641807244754e-07, |
|
"loss": 0.5218, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.20057729109993613, |
|
"learning_rate": 1.5979216850509848e-07, |
|
"loss": 0.5225, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.20086887199752657, |
|
"learning_rate": 1.4861010983126202e-07, |
|
"loss": 0.521, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.20600288196892624, |
|
"learning_rate": 1.3783069908621772e-07, |
|
"loss": 0.5233, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.2032590701020277, |
|
"learning_rate": 1.274543768481451e-07, |
|
"loss": 0.5298, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.19937401805144794, |
|
"learning_rate": 1.1748156722011128e-07, |
|
"loss": 0.5012, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.2021678865177686, |
|
"learning_rate": 1.0791267781273263e-07, |
|
"loss": 0.5242, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.20777988223157803, |
|
"learning_rate": 9.874809972752697e-08, |
|
"loss": 0.5293, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.20368366862991866, |
|
"learning_rate": 8.99882075409153e-08, |
|
"loss": 0.5153, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.20787994510400237, |
|
"learning_rate": 8.16333592889207e-08, |
|
"loss": 0.5345, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.20946081246812162, |
|
"learning_rate": 7.368389645252772e-08, |
|
"loss": 0.5229, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.20571639236209135, |
|
"learning_rate": 6.61401439437348e-08, |
|
"loss": 0.5151, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.21182777269953695, |
|
"learning_rate": 5.9002410092262593e-08, |
|
"loss": 0.5207, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.1955808209671619, |
|
"learning_rate": 5.227098663296404e-08, |
|
"loss": 0.5073, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.20179292685793432, |
|
"learning_rate": 4.594614869388947e-08, |
|
"loss": 0.5212, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.20236344715337046, |
|
"learning_rate": 4.002815478505007e-08, |
|
"loss": 0.5208, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.2112521255560089, |
|
"learning_rate": 3.451724678784518e-08, |
|
"loss": 0.5155, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.1988319336416073, |
|
"learning_rate": 2.9413649945182475e-08, |
|
"loss": 0.5149, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.2393477878018782, |
|
"learning_rate": 2.47175728522675e-08, |
|
"loss": 0.5196, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.1989296468037607, |
|
"learning_rate": 2.0429207448078302e-08, |
|
"loss": 0.4954, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.21328717756995305, |
|
"learning_rate": 1.654872900752169e-08, |
|
"loss": 0.5211, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.2043004896942241, |
|
"learning_rate": 1.3076296134271194e-08, |
|
"loss": 0.5298, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.20203087493728625, |
|
"learning_rate": 1.0012050754277802e-08, |
|
"loss": 0.5142, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.20708771065311932, |
|
"learning_rate": 7.356118109977939e-09, |
|
"loss": 0.5352, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.20602452029141277, |
|
"learning_rate": 5.108606755168666e-09, |
|
"loss": 0.528, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.20878635383712413, |
|
"learning_rate": 3.269608550571235e-09, |
|
"loss": 0.5155, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.1972058695515002, |
|
"learning_rate": 1.839198660079644e-09, |
|
"loss": 0.5121, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.2017859212074039, |
|
"learning_rate": 8.174355476864293e-10, |
|
"loss": 0.5193, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.20180951712274753, |
|
"learning_rate": 2.0436097509235475e-10, |
|
"loss": 0.5331, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.2006750130728712, |
|
"learning_rate": 0.0, |
|
"loss": 0.5227, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.6613607406616211, |
|
"eval_runtime": 13.2749, |
|
"eval_samples_per_second": 103.805, |
|
"eval_steps_per_second": 0.829, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 2730, |
|
"total_flos": 1.183665069490176e+16, |
|
"train_loss": 0.6159939037574517, |
|
"train_runtime": 13745.4884, |
|
"train_samples_per_second": 25.399, |
|
"train_steps_per_second": 0.199 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2730, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"total_flos": 1.183665069490176e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |