|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 1949, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.7453284027716789, |
|
"learning_rate": 1.0256410256410257e-06, |
|
"loss": 4.8029, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.8838630132500438, |
|
"learning_rate": 5.128205128205128e-06, |
|
"loss": 4.8678, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.9750863683254836, |
|
"learning_rate": 1.0256410256410256e-05, |
|
"loss": 4.7915, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.0960739711216374, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 4.7429, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.68908841814168, |
|
"learning_rate": 2.0512820512820512e-05, |
|
"loss": 4.655, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.4517244544448675, |
|
"learning_rate": 2.564102564102564e-05, |
|
"loss": 4.3514, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.136057917635942, |
|
"learning_rate": 3.0769230769230774e-05, |
|
"loss": 3.905, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.589117875619624, |
|
"learning_rate": 3.58974358974359e-05, |
|
"loss": 3.2509, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.927853133671092, |
|
"learning_rate": 4.1025641025641023e-05, |
|
"loss": 2.6795, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.054252767001273, |
|
"learning_rate": 4.615384615384616e-05, |
|
"loss": 2.187, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.9849933991408988, |
|
"learning_rate": 5.128205128205128e-05, |
|
"loss": 1.9199, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.8072063144215404, |
|
"learning_rate": 5.6410256410256414e-05, |
|
"loss": 1.7439, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.3923291097565402, |
|
"learning_rate": 6.153846153846155e-05, |
|
"loss": 1.7219, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.1768172611689185, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.6276, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.1476923157725187, |
|
"learning_rate": 7.17948717948718e-05, |
|
"loss": 1.5326, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.9441328628031238, |
|
"learning_rate": 7.692307692307693e-05, |
|
"loss": 1.5766, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.8493903573919569, |
|
"learning_rate": 8.205128205128205e-05, |
|
"loss": 1.4017, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.0349555509222987, |
|
"learning_rate": 8.717948717948718e-05, |
|
"loss": 1.447, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8454953032919131, |
|
"learning_rate": 9.230769230769232e-05, |
|
"loss": 1.4113, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.0220924006013166, |
|
"learning_rate": 9.743589743589744e-05, |
|
"loss": 1.4865, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.9739811152642958, |
|
"learning_rate": 0.00010256410256410256, |
|
"loss": 1.4709, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8989088188815814, |
|
"learning_rate": 0.0001076923076923077, |
|
"loss": 1.4335, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.9043247030909609, |
|
"learning_rate": 0.00011282051282051283, |
|
"loss": 1.4196, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.0554112608560857, |
|
"learning_rate": 0.00011794871794871796, |
|
"loss": 1.376, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.9282876975712975, |
|
"learning_rate": 0.0001230769230769231, |
|
"loss": 1.4137, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.8957189154484865, |
|
"learning_rate": 0.00012820512820512823, |
|
"loss": 1.3528, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.3941737136229102, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 1.3963, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.7436200394991389, |
|
"learning_rate": 0.00013846153846153847, |
|
"loss": 1.3047, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.1477489067913107, |
|
"learning_rate": 0.0001435897435897436, |
|
"loss": 1.3011, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.8722003799830739, |
|
"learning_rate": 0.00014871794871794872, |
|
"loss": 1.2378, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.1769840880132467, |
|
"learning_rate": 0.00015384615384615385, |
|
"loss": 1.4124, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.7399233026092019, |
|
"learning_rate": 0.00015897435897435896, |
|
"loss": 1.3027, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.9538756518623174, |
|
"learning_rate": 0.0001641025641025641, |
|
"loss": 1.2812, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.987821734343656, |
|
"learning_rate": 0.00016923076923076923, |
|
"loss": 1.3357, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.9972992684207476, |
|
"learning_rate": 0.00017435897435897436, |
|
"loss": 1.3845, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.9162912417797362, |
|
"learning_rate": 0.0001794871794871795, |
|
"loss": 1.4145, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.9727023032566935, |
|
"learning_rate": 0.00018461538461538463, |
|
"loss": 1.2514, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.1205678233237835, |
|
"learning_rate": 0.00018974358974358974, |
|
"loss": 1.3525, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.9769806404545222, |
|
"learning_rate": 0.00019487179487179487, |
|
"loss": 1.2291, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.8933569683133975, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3543, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.9689935176055495, |
|
"learning_rate": 0.00019999598996948235, |
|
"loss": 1.2744, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.9199321088853798, |
|
"learning_rate": 0.00019998396019953624, |
|
"loss": 1.3059, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.0113198660825118, |
|
"learning_rate": 0.0001999639116549566, |
|
"loss": 1.1999, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.7980098887264342, |
|
"learning_rate": 0.00019993584594364894, |
|
"loss": 1.2418, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.7449325030003408, |
|
"learning_rate": 0.0001998997653165004, |
|
"loss": 1.284, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.0156080824941, |
|
"learning_rate": 0.00019985567266719934, |
|
"loss": 1.3727, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.0387246660942182, |
|
"learning_rate": 0.00019980357153200315, |
|
"loss": 1.3186, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.1886194935509105, |
|
"learning_rate": 0.00019974346608945466, |
|
"loss": 1.2949, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.9654192561255139, |
|
"learning_rate": 0.00019967536116004698, |
|
"loss": 1.3067, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.0385716215564977, |
|
"learning_rate": 0.00019959926220583713, |
|
"loss": 1.289, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.8709732110452237, |
|
"learning_rate": 0.00019951517533000764, |
|
"loss": 1.2138, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6847073355471284, |
|
"learning_rate": 0.00019942310727637724, |
|
"loss": 1.2844, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.051169320358427, |
|
"learning_rate": 0.00019932306542886009, |
|
"loss": 1.3289, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.8704314065526715, |
|
"learning_rate": 0.00019921505781087334, |
|
"loss": 1.3339, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.7816111152013846, |
|
"learning_rate": 0.00019909909308469398, |
|
"loss": 1.3282, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.038569084884339, |
|
"learning_rate": 0.0001989751805507637, |
|
"loss": 1.3112, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.9061282430692424, |
|
"learning_rate": 0.00019884333014694345, |
|
"loss": 1.3251, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.7513521845340044, |
|
"learning_rate": 0.00019870355244771607, |
|
"loss": 1.2893, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.910789847545989, |
|
"learning_rate": 0.00019855585866333835, |
|
"loss": 1.288, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.0532355924793768, |
|
"learning_rate": 0.00019840026063894193, |
|
"loss": 1.2805, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.0244686274469181, |
|
"learning_rate": 0.00019823677085358335, |
|
"loss": 1.2499, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7707389186864114, |
|
"learning_rate": 0.00019806540241924317, |
|
"loss": 1.2532, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7446042971478654, |
|
"learning_rate": 0.00019788616907977441, |
|
"loss": 1.2981, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6848536091053609, |
|
"learning_rate": 0.00019769908520980034, |
|
"loss": 1.2565, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7586642653181629, |
|
"learning_rate": 0.00019750416581356146, |
|
"loss": 1.2648, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.9291629579523186, |
|
"learning_rate": 0.00019730142652371236, |
|
"loss": 1.1065, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7774039891171479, |
|
"learning_rate": 0.0001970908836000678, |
|
"loss": 1.2582, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7535737744178963, |
|
"learning_rate": 0.00019687255392829877, |
|
"loss": 1.255, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.8232038564451061, |
|
"learning_rate": 0.0001966464550185782, |
|
"loss": 1.2246, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7763354984102764, |
|
"learning_rate": 0.0001964126050041767, |
|
"loss": 1.1932, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.9765706941888299, |
|
"learning_rate": 0.0001961710226400081, |
|
"loss": 1.2209, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7182663624300912, |
|
"learning_rate": 0.00019592172730112544, |
|
"loss": 1.2383, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.8481687196291695, |
|
"learning_rate": 0.00019566473898116713, |
|
"loss": 1.2911, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7711940010123207, |
|
"learning_rate": 0.0001954000782907532, |
|
"loss": 1.231, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.061067518571626, |
|
"learning_rate": 0.00019512776645583263, |
|
"loss": 1.259, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.0715949890319147, |
|
"learning_rate": 0.00019484782531598073, |
|
"loss": 1.2712, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.774154512584797, |
|
"learning_rate": 0.00019456027732264784, |
|
"loss": 1.306, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.8172535402599491, |
|
"learning_rate": 0.00019426514553735848, |
|
"loss": 1.2734, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.9451921871881396, |
|
"learning_rate": 0.00019396245362986197, |
|
"loss": 1.231, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.9954188288045217, |
|
"learning_rate": 0.00019365222587623405, |
|
"loss": 1.2061, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.8088092576041586, |
|
"learning_rate": 0.00019333448715692995, |
|
"loss": 1.2403, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.9867132889317975, |
|
"learning_rate": 0.00019300926295478884, |
|
"loss": 1.3009, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.8580342036883377, |
|
"learning_rate": 0.0001926765793529902, |
|
"loss": 1.2371, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.813693213024289, |
|
"learning_rate": 0.00019233646303296205, |
|
"loss": 1.2767, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.8552421430267484, |
|
"learning_rate": 0.00019198894127224074, |
|
"loss": 1.2419, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.9962550179351269, |
|
"learning_rate": 0.0001916340419422837, |
|
"loss": 1.2765, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.790052217945725, |
|
"learning_rate": 0.00019127179350623372, |
|
"loss": 1.2135, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.7916779759677229, |
|
"learning_rate": 0.0001909022250166365, |
|
"loss": 1.2178, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.7297942006635301, |
|
"learning_rate": 0.00019052536611311046, |
|
"loss": 1.245, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.8857053212429089, |
|
"learning_rate": 0.00019014124701996973, |
|
"loss": 1.2657, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.726328154530024, |
|
"learning_rate": 0.00018974989854379996, |
|
"loss": 1.2481, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.0011198664456165, |
|
"learning_rate": 0.00018935135207098785, |
|
"loss": 1.2323, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7783715496305177, |
|
"learning_rate": 0.00018894563956520374, |
|
"loss": 1.2367, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.8696181443757303, |
|
"learning_rate": 0.00018853279356483826, |
|
"loss": 1.315, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7894223278767809, |
|
"learning_rate": 0.00018811284718039256, |
|
"loss": 1.124, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1136345673311567, |
|
"learning_rate": 0.00018768583409182305, |
|
"loss": 1.2198, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7243107301552627, |
|
"learning_rate": 0.00018725178854584007, |
|
"loss": 1.2523, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.6223206270124363, |
|
"learning_rate": 0.00018681074535316125, |
|
"loss": 1.2673, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.8039914099462059, |
|
"learning_rate": 0.00018636273988571991, |
|
"loss": 1.1877, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7116186179251531, |
|
"learning_rate": 0.0001859078080738279, |
|
"loss": 1.1458, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7310411784585837, |
|
"learning_rate": 0.00018544598640329432, |
|
"loss": 1.2417, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7429734188864324, |
|
"learning_rate": 0.00018497731191249894, |
|
"loss": 1.2852, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.9083872621008465, |
|
"learning_rate": 0.000184501822189422, |
|
"loss": 1.2436, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.9414190517105039, |
|
"learning_rate": 0.00018401955536862948, |
|
"loss": 1.2382, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.9100118313504707, |
|
"learning_rate": 0.0001835305501282148, |
|
"loss": 1.2651, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.8682421638749216, |
|
"learning_rate": 0.00018303484568669667, |
|
"loss": 1.214, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7124640006090736, |
|
"learning_rate": 0.00018253248179987388, |
|
"loss": 1.1703, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7882268918908124, |
|
"learning_rate": 0.0001820234987576368, |
|
"loss": 1.1747, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.9355156994606193, |
|
"learning_rate": 0.00018150793738073602, |
|
"loss": 1.1954, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.962953079362863, |
|
"learning_rate": 0.00018098583901750867, |
|
"loss": 1.21, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.8058576401677449, |
|
"learning_rate": 0.00018045724554056214, |
|
"loss": 1.1945, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.8718736747079745, |
|
"learning_rate": 0.0001799221993434159, |
|
"loss": 1.242, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.8820093361979019, |
|
"learning_rate": 0.00017938074333710157, |
|
"loss": 1.216, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.626929459625588, |
|
"learning_rate": 0.00017883292094672128, |
|
"loss": 1.1947, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.8252467152886114, |
|
"learning_rate": 0.00017827877610796514, |
|
"loss": 1.1953, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.894675298458733, |
|
"learning_rate": 0.00017771835326358743, |
|
"loss": 1.1873, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.6648245020030112, |
|
"learning_rate": 0.00017715169735984233, |
|
"loss": 1.1293, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.8629378772284809, |
|
"learning_rate": 0.0001765788538428792, |
|
"loss": 1.288, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.9066125782712056, |
|
"learning_rate": 0.00017599986865509767, |
|
"loss": 1.2058, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7423874071094034, |
|
"learning_rate": 0.00017541478823146327, |
|
"loss": 1.2508, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.01835592219078, |
|
"learning_rate": 0.00017482365949578302, |
|
"loss": 1.1974, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.878167603852895, |
|
"learning_rate": 0.00017422652985694237, |
|
"loss": 1.2583, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7708769505928975, |
|
"learning_rate": 0.00017362344720510278, |
|
"loss": 1.1853, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7652211411520762, |
|
"learning_rate": 0.00017301445990786102, |
|
"loss": 1.2707, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7286424011748441, |
|
"learning_rate": 0.00017239961680637, |
|
"loss": 1.2044, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6950598836326154, |
|
"learning_rate": 0.0001717789672114218, |
|
"loss": 1.3171, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7009337184084006, |
|
"learning_rate": 0.0001711525608994927, |
|
"loss": 1.1541, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.8543565184004501, |
|
"learning_rate": 0.00017052044810875126, |
|
"loss": 1.2465, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.1754223407681135, |
|
"learning_rate": 0.00016988267953502913, |
|
"loss": 1.1644, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.8933582330947162, |
|
"learning_rate": 0.00016923930632775516, |
|
"loss": 1.1798, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.932416979264158, |
|
"learning_rate": 0.00016859038008585326, |
|
"loss": 1.1983, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.8373224826856496, |
|
"learning_rate": 0.0001679359528536041, |
|
"loss": 1.1567, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.8214365574255561, |
|
"learning_rate": 0.00016727607711647114, |
|
"loss": 1.2188, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.9351111268728677, |
|
"learning_rate": 0.00016661080579689132, |
|
"loss": 1.1873, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.8498925724649622, |
|
"learning_rate": 0.0001659401922500304, |
|
"loss": 1.1937, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.7284767681268423, |
|
"learning_rate": 0.00016526429025950424, |
|
"loss": 1.1301, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.7481373744501812, |
|
"learning_rate": 0.00016458315403306502, |
|
"loss": 1.2671, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.8920310346592031, |
|
"learning_rate": 0.0001638968381982538, |
|
"loss": 1.1528, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.7925520466506051, |
|
"learning_rate": 0.0001632053977980194, |
|
"loss": 1.2368, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.0192425967562995, |
|
"learning_rate": 0.000162508888286304, |
|
"loss": 1.2167, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7085398144677333, |
|
"learning_rate": 0.00016180736552359553, |
|
"loss": 1.2218, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.768972223449797, |
|
"learning_rate": 0.00016110088577244773, |
|
"loss": 1.2483, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.8053989910048416, |
|
"learning_rate": 0.00016038950569296785, |
|
"loss": 1.1811, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.7498380291821676, |
|
"learning_rate": 0.00015967328233827249, |
|
"loss": 1.2625, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.7372396772067858, |
|
"learning_rate": 0.00015895227314991178, |
|
"loss": 1.1298, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.7380540614428072, |
|
"learning_rate": 0.00015822653595326275, |
|
"loss": 1.1611, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.840298145620476, |
|
"learning_rate": 0.00015749612895289152, |
|
"loss": 1.1815, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6743184976694755, |
|
"learning_rate": 0.00015676111072788527, |
|
"loss": 1.1665, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.0773509553134593, |
|
"learning_rate": 0.00015602154022715435, |
|
"loss": 1.214, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.2157724994677412, |
|
"learning_rate": 0.0001552774767647043, |
|
"loss": 1.1757, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.8932509973388469, |
|
"learning_rate": 0.0001545289800148789, |
|
"loss": 1.1747, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.773624653232096, |
|
"learning_rate": 0.0001537761100075744, |
|
"loss": 1.1934, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6455277482504939, |
|
"learning_rate": 0.00015301892712342482, |
|
"loss": 1.2505, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7996020026859186, |
|
"learning_rate": 0.00015225749208895968, |
|
"loss": 1.2388, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7876699542363711, |
|
"learning_rate": 0.0001514918659717335, |
|
"loss": 1.2573, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7507865451214197, |
|
"learning_rate": 0.00015072211017542813, |
|
"loss": 1.2108, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.742160262755694, |
|
"learning_rate": 0.00014994828643492827, |
|
"loss": 1.1554, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7898189151931231, |
|
"learning_rate": 0.00014917045681137026, |
|
"loss": 1.1858, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.7929109677821212, |
|
"learning_rate": 0.0001483886836871646, |
|
"loss": 1.1181, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8366860762380138, |
|
"learning_rate": 0.00014760302976099304, |
|
"loss": 1.1683, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8562461991337411, |
|
"learning_rate": 0.00014681355804278001, |
|
"loss": 1.1545, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6903144224243607, |
|
"learning_rate": 0.00014602033184863913, |
|
"loss": 1.1668, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.720558400777453, |
|
"learning_rate": 0.00014522341479579533, |
|
"loss": 1.3024, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6374665332941237, |
|
"learning_rate": 0.00014442287079748263, |
|
"loss": 1.1983, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.8700325415003359, |
|
"learning_rate": 0.00014361876405781832, |
|
"loss": 1.1604, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.876670415809109, |
|
"learning_rate": 0.00014281115906665374, |
|
"loss": 1.2463, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.9136860273140437, |
|
"learning_rate": 0.00014200012059440207, |
|
"loss": 1.1814, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.8346225178534373, |
|
"learning_rate": 0.00014118571368684383, |
|
"loss": 1.2552, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.698383520911491, |
|
"learning_rate": 0.00014036800365991008, |
|
"loss": 1.1132, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.8620628308046092, |
|
"learning_rate": 0.00013954705609444404, |
|
"loss": 1.1663, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.8119834436603902, |
|
"learning_rate": 0.00013872293683094152, |
|
"loss": 1.2139, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.7204007887604488, |
|
"learning_rate": 0.00013789571196427055, |
|
"loss": 1.193, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.9461103539326526, |
|
"learning_rate": 0.00013706544783837022, |
|
"loss": 1.1549, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.1236203572833683, |
|
"learning_rate": 0.00013623221104093025, |
|
"loss": 1.2752, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7297339296275664, |
|
"learning_rate": 0.00013539606839805036, |
|
"loss": 1.1884, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7684475171790612, |
|
"learning_rate": 0.00013455708696888085, |
|
"loss": 1.1928, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7160993458180628, |
|
"learning_rate": 0.00013371533404024438, |
|
"loss": 1.2035, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7366548665049109, |
|
"learning_rate": 0.00013287087712123962, |
|
"loss": 1.2372, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7983969367173553, |
|
"learning_rate": 0.00013202378393782692, |
|
"loss": 1.1931, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.9470935523528393, |
|
"learning_rate": 0.00013117412242739655, |
|
"loss": 1.1274, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.0451209845718006, |
|
"learning_rate": 0.00013032196073332027, |
|
"loss": 1.2435, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7471749313716822, |
|
"learning_rate": 0.00012946736719948607, |
|
"loss": 1.1844, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6962269357662448, |
|
"learning_rate": 0.000128610410364817, |
|
"loss": 1.1526, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.8391163485957552, |
|
"learning_rate": 0.00012775115895777417, |
|
"loss": 1.2126, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.9175047450119641, |
|
"learning_rate": 0.00012688968189084493, |
|
"loss": 1.1199, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.8881533984263993, |
|
"learning_rate": 0.00012602604825501587, |
|
"loss": 1.1842, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.68829893427438, |
|
"learning_rate": 0.00012516032731423165, |
|
"loss": 1.1916, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.8290955836420092, |
|
"learning_rate": 0.00012429258849984014, |
|
"loss": 1.1915, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7546478279579057, |
|
"learning_rate": 0.00012342290140502388, |
|
"loss": 1.1875, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7857053878657014, |
|
"learning_rate": 0.00012255133577921868, |
|
"loss": 1.1883, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.7728388731432995, |
|
"learning_rate": 0.0001216779615225197, |
|
"loss": 1.1329, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.762285378503341, |
|
"learning_rate": 0.00012080284868007541, |
|
"loss": 1.1857, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.8455730497463988, |
|
"learning_rate": 0.0001199260674364699, |
|
"loss": 1.2241, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7637030076266145, |
|
"learning_rate": 0.00011904768811009405, |
|
"loss": 1.2333, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7399802799801922, |
|
"learning_rate": 0.00011816778114750593, |
|
"loss": 1.2126, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7804083472026294, |
|
"learning_rate": 0.00011728641711778103, |
|
"loss": 1.2239, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7666274480697189, |
|
"learning_rate": 0.00011640366670685248, |
|
"loss": 1.105, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7651209631784908, |
|
"learning_rate": 0.00011551960071184195, |
|
"loss": 1.3045, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.9726990471095971, |
|
"learning_rate": 0.00011463429003538196, |
|
"loss": 1.1956, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7087953125740096, |
|
"learning_rate": 0.000113747805679929, |
|
"loss": 1.2197, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.8944588834117093, |
|
"learning_rate": 0.00011286021874206952, |
|
"loss": 1.0926, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.0048773208945307, |
|
"learning_rate": 0.00011197160040681762, |
|
"loss": 1.2798, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7348160099574793, |
|
"learning_rate": 0.0001110820219419062, |
|
"loss": 1.1951, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7992861590760896, |
|
"learning_rate": 0.0001101915546920711, |
|
"loss": 1.1812, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7458148092571341, |
|
"learning_rate": 0.00010930027007332923, |
|
"loss": 1.1236, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7657904989795685, |
|
"learning_rate": 0.00010840823956725103, |
|
"loss": 1.1714, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7797801904462873, |
|
"learning_rate": 0.00010751553471522757, |
|
"loss": 1.164, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6805580259070753, |
|
"learning_rate": 0.00010662222711273279, |
|
"loss": 1.1326, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.8265666935073783, |
|
"learning_rate": 0.00010572838840358168, |
|
"loss": 1.1233, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.824127124532494, |
|
"learning_rate": 0.00010483409027418425, |
|
"loss": 1.1273, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.8666956799033239, |
|
"learning_rate": 0.00010393940444779635, |
|
"loss": 1.1172, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7597618280586041, |
|
"learning_rate": 0.00010304440267876727, |
|
"loss": 1.2262, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.8594584043580179, |
|
"learning_rate": 0.00010214915674678523, |
|
"loss": 1.237, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.8503111582046956, |
|
"learning_rate": 0.00010125373845112034, |
|
"loss": 1.2224, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.8459974040134323, |
|
"learning_rate": 0.00010035821960486643, |
|
"loss": 1.1968, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.8236420083982484, |
|
"learning_rate": 9.946267202918157e-05, |
|
"loss": 1.1334, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.9248108629370668, |
|
"learning_rate": 9.856716754752796e-05, |
|
"loss": 1.1352, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.8808986018254903, |
|
"learning_rate": 9.767177797991155e-05, |
|
"loss": 1.1959, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.635409279473428, |
|
"learning_rate": 9.677657513712221e-05, |
|
"loss": 1.1506, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7550156809913564, |
|
"learning_rate": 9.588163081497427e-05, |
|
"loss": 1.2308, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7609369024863887, |
|
"learning_rate": 9.498701678854865e-05, |
|
"loss": 1.1722, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.8064136692370791, |
|
"learning_rate": 9.409280480643628e-05, |
|
"loss": 1.2279, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7131735035318008, |
|
"learning_rate": 9.319906658498389e-05, |
|
"loss": 1.1862, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.8678942495261857, |
|
"learning_rate": 9.230587380254237e-05, |
|
"loss": 1.1531, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.6604664522870226, |
|
"learning_rate": 9.141329809371803e-05, |
|
"loss": 1.1744, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.7095675124068804, |
|
"learning_rate": 9.052141104362748e-05, |
|
"loss": 1.1311, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.7845862314567627, |
|
"learning_rate": 8.963028418215653e-05, |
|
"loss": 1.2801, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.8019276427700621, |
|
"learning_rate": 8.873998897822336e-05, |
|
"loss": 1.2425, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.0094066794890815, |
|
"learning_rate": 8.785059683404672e-05, |
|
"loss": 1.1287, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.9258556140522551, |
|
"learning_rate": 8.696217907941941e-05, |
|
"loss": 1.2059, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.9454158193155473, |
|
"learning_rate": 8.607480696598762e-05, |
|
"loss": 1.0897, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.8742232040464293, |
|
"learning_rate": 8.518855166153644e-05, |
|
"loss": 1.2234, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.8764506057333485, |
|
"learning_rate": 8.43034842442822e-05, |
|
"loss": 1.1409, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.9294531011673496, |
|
"learning_rate": 8.341967569717202e-05, |
|
"loss": 1.0986, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.8413972427762672, |
|
"learning_rate": 8.253719690219079e-05, |
|
"loss": 1.1334, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.9118769910143452, |
|
"learning_rate": 8.165611863467644e-05, |
|
"loss": 1.145, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.7880447725035846, |
|
"learning_rate": 8.077651155764387e-05, |
|
"loss": 1.1299, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.8382287781274999, |
|
"learning_rate": 7.98984462161175e-05, |
|
"loss": 1.2243, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6697229577393602, |
|
"learning_rate": 7.902199303147363e-05, |
|
"loss": 1.2093, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.7808818157955248, |
|
"learning_rate": 7.814722229579264e-05, |
|
"loss": 1.1984, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.9135627361643314, |
|
"learning_rate": 7.727420416622144e-05, |
|
"loss": 1.2091, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.680528612137634, |
|
"learning_rate": 7.640300865934687e-05, |
|
"loss": 1.1034, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.812190629178745, |
|
"learning_rate": 7.553370564558032e-05, |
|
"loss": 1.164, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.6786859157828664, |
|
"learning_rate": 7.46663648435541e-05, |
|
"loss": 1.1711, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.0608937731498502, |
|
"learning_rate": 7.380105581452987e-05, |
|
"loss": 1.1321, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.8774620996995025, |
|
"learning_rate": 7.293784795681994e-05, |
|
"loss": 1.2073, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.8397584969858537, |
|
"learning_rate": 7.207681050022132e-05, |
|
"loss": 1.1356, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7565443884452981, |
|
"learning_rate": 7.121801250046363e-05, |
|
"loss": 1.2799, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6473890021488468, |
|
"learning_rate": 7.036152283367056e-05, |
|
"loss": 1.1355, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.8635483831775488, |
|
"learning_rate": 6.950741019083617e-05, |
|
"loss": 1.172, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7635797502403529, |
|
"learning_rate": 6.865574307231575e-05, |
|
"loss": 1.1359, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.9134433746313595, |
|
"learning_rate": 6.780658978233199e-05, |
|
"loss": 1.1523, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.7862648398970986, |
|
"learning_rate": 6.696001842349702e-05, |
|
"loss": 1.1786, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.8737779184120746, |
|
"learning_rate": 6.611609689135056e-05, |
|
"loss": 1.2463, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.8231816299971835, |
|
"learning_rate": 6.527489286891459e-05, |
|
"loss": 1.0201, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.833558803199825, |
|
"learning_rate": 6.443647382126509e-05, |
|
"loss": 1.1555, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.7866154882582159, |
|
"learning_rate": 6.360090699012145e-05, |
|
"loss": 1.138, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.8309461290217522, |
|
"learning_rate": 6.27682593884535e-05, |
|
"loss": 1.0762, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.7270575882509663, |
|
"learning_rate": 6.193859779510712e-05, |
|
"loss": 1.2055, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.7267387499731184, |
|
"learning_rate": 6.111198874944845e-05, |
|
"loss": 1.1906, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.8081247294023348, |
|
"learning_rate": 6.0288498546027536e-05, |
|
"loss": 1.2243, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.8237165040439475, |
|
"learning_rate": 5.946819322926127e-05, |
|
"loss": 1.0994, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.860921836417512, |
|
"learning_rate": 5.865113858813673e-05, |
|
"loss": 1.1615, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.7995623045168802, |
|
"learning_rate": 5.783740015093484e-05, |
|
"loss": 1.1728, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.7326482406819708, |
|
"learning_rate": 5.702704317997492e-05, |
|
"loss": 1.112, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.8123434824427339, |
|
"learning_rate": 5.6220132666380635e-05, |
|
"loss": 1.0999, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.8440723192599834, |
|
"learning_rate": 5.541673332486773e-05, |
|
"loss": 1.2143, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.7328759482460477, |
|
"learning_rate": 5.4616909588553674e-05, |
|
"loss": 1.2724, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.6946255004506102, |
|
"learning_rate": 5.3820725603790346e-05, |
|
"loss": 1.2365, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7864322469951978, |
|
"learning_rate": 5.30282452250193e-05, |
|
"loss": 1.2184, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7860065077827799, |
|
"learning_rate": 5.223953200965055e-05, |
|
"loss": 1.177, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7870610754641466, |
|
"learning_rate": 5.145464921296537e-05, |
|
"loss": 1.2478, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.8098199890971101, |
|
"learning_rate": 5.067365978304315e-05, |
|
"loss": 1.1132, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7277674932711228, |
|
"learning_rate": 4.9896626355712805e-05, |
|
"loss": 1.1065, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6702441184475941, |
|
"learning_rate": 4.912361124952948e-05, |
|
"loss": 1.1489, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.7382556968521137, |
|
"learning_rate": 4.835467646077656e-05, |
|
"loss": 1.1145, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.7012581002313999, |
|
"learning_rate": 4.7589883658493296e-05, |
|
"loss": 1.1196, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.8448006051293048, |
|
"learning_rate": 4.682929417952939e-05, |
|
"loss": 1.1081, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6907481068653331, |
|
"learning_rate": 4.6072969023625165e-05, |
|
"loss": 1.1438, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7998262194855656, |
|
"learning_rate": 4.532096884851978e-05, |
|
"loss": 1.1514, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.9014262005578314, |
|
"learning_rate": 4.457335396508631e-05, |
|
"loss": 1.1583, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.8456699239379848, |
|
"learning_rate": 4.383018433249464e-05, |
|
"loss": 1.1471, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7936192686618659, |
|
"learning_rate": 4.309151955340297e-05, |
|
"loss": 1.1212, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.8376082121769935, |
|
"learning_rate": 4.2357418869177354e-05, |
|
"loss": 1.2077, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.8095128041860435, |
|
"learning_rate": 4.162794115514078e-05, |
|
"loss": 1.2169, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.7509614581437917, |
|
"learning_rate": 4.0903144915851174e-05, |
|
"loss": 1.0553, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.7839995425652196, |
|
"learning_rate": 4.018308828040924e-05, |
|
"loss": 1.0728, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.706531516175813, |
|
"learning_rate": 3.946782899779667e-05, |
|
"loss": 1.1324, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.8815003834638072, |
|
"learning_rate": 3.875742443224451e-05, |
|
"loss": 1.1515, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6975354892323637, |
|
"learning_rate": 3.805193155863247e-05, |
|
"loss": 1.0919, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7405117357753763, |
|
"learning_rate": 3.7351406957919636e-05, |
|
"loss": 1.1764, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.7778411867278201, |
|
"learning_rate": 3.665590681260658e-05, |
|
"loss": 1.1504, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8723639773246538, |
|
"learning_rate": 3.59654869022294e-05, |
|
"loss": 1.2015, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.7314301031903592, |
|
"learning_rate": 3.5280202598886324e-05, |
|
"loss": 1.1912, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.78382821788559, |
|
"learning_rate": 3.4600108862796796e-05, |
|
"loss": 1.1531, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.746538778895017, |
|
"learning_rate": 3.392526023789349e-05, |
|
"loss": 1.1861, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7335920604121904, |
|
"learning_rate": 3.325571084744803e-05, |
|
"loss": 1.1032, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.8181126090151531, |
|
"learning_rate": 3.259151438973024e-05, |
|
"loss": 1.2354, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.8813117253151651, |
|
"learning_rate": 3.1932724133701344e-05, |
|
"loss": 1.0998, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.7703731279590557, |
|
"learning_rate": 3.1279392914742046e-05, |
|
"loss": 1.1732, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.8022002883485505, |
|
"learning_rate": 3.06315731304148e-05, |
|
"loss": 1.2005, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.7718995592442958, |
|
"learning_rate": 2.998931673626175e-05, |
|
"loss": 1.0812, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.7753353561939175, |
|
"learning_rate": 2.935267524163774e-05, |
|
"loss": 1.2016, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.8453046609631811, |
|
"learning_rate": 2.872169970557913e-05, |
|
"loss": 1.2015, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.788689710824734, |
|
"learning_rate": 2.8096440732709083e-05, |
|
"loss": 1.2301, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.9498609267061067, |
|
"learning_rate": 2.7476948469178887e-05, |
|
"loss": 1.1721, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.2548603296587735, |
|
"learning_rate": 2.6863272598646106e-05, |
|
"loss": 1.1309, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.8088505633219847, |
|
"learning_rate": 2.625546233829016e-05, |
|
"loss": 1.1924, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.8053156607642844, |
|
"learning_rate": 2.5653566434864928e-05, |
|
"loss": 1.1771, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.0076517594688355, |
|
"learning_rate": 2.5057633160789184e-05, |
|
"loss": 1.1547, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.7807995790198267, |
|
"learning_rate": 2.446771031027527e-05, |
|
"loss": 1.2168, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.7811855005276076, |
|
"learning_rate": 2.3883845195495878e-05, |
|
"loss": 1.2407, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.7607446953536656, |
|
"learning_rate": 2.330608464278953e-05, |
|
"loss": 1.1777, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.8030643238657801, |
|
"learning_rate": 2.273447498890521e-05, |
|
"loss": 1.2321, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.707300837474353, |
|
"learning_rate": 2.2169062077286075e-05, |
|
"loss": 1.103, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.7398162416161335, |
|
"learning_rate": 2.1609891254392678e-05, |
|
"loss": 1.082, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.8829809189432961, |
|
"learning_rate": 2.1057007366066373e-05, |
|
"loss": 1.1863, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.9036595461566347, |
|
"learning_rate": 2.0510454753932395e-05, |
|
"loss": 1.1125, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.6793566054364535, |
|
"learning_rate": 1.9970277251843862e-05, |
|
"loss": 1.1453, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.7465371914976613, |
|
"learning_rate": 1.9436518182366158e-05, |
|
"loss": 1.1914, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.6678427111685153, |
|
"learning_rate": 1.8909220353302392e-05, |
|
"loss": 1.1865, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.8490303048462068, |
|
"learning_rate": 1.838842605426031e-05, |
|
"loss": 1.0754, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6832719345179151, |
|
"learning_rate": 1.7874177053260598e-05, |
|
"loss": 1.0879, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.9649690559163364, |
|
"learning_rate": 1.736651459338695e-05, |
|
"loss": 1.0977, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6985183059667567, |
|
"learning_rate": 1.6865479389478545e-05, |
|
"loss": 1.1476, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6282090702067379, |
|
"learning_rate": 1.6371111624864543e-05, |
|
"loss": 1.1299, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.8512442207678547, |
|
"learning_rate": 1.5883450948141377e-05, |
|
"loss": 1.1309, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.7151617030884643, |
|
"learning_rate": 1.540253646999299e-05, |
|
"loss": 1.1936, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.7567039565297413, |
|
"learning_rate": 1.4928406760054059e-05, |
|
"loss": 1.1441, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.8464089358888328, |
|
"learning_rate": 1.4461099843816684e-05, |
|
"loss": 1.145, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.7170403140830546, |
|
"learning_rate": 1.4000653199580782e-05, |
|
"loss": 1.1887, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.7735313453026896, |
|
"learning_rate": 1.3547103755448287e-05, |
|
"loss": 1.1786, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.7081409924001252, |
|
"learning_rate": 1.3100487886361379e-05, |
|
"loss": 1.2186, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.6796669559142102, |
|
"learning_rate": 1.266084141118542e-05, |
|
"loss": 1.1473, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.7301679940106633, |
|
"learning_rate": 1.2228199589835999e-05, |
|
"loss": 1.1798, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.838436487459379, |
|
"learning_rate": 1.1802597120451286e-05, |
|
"loss": 1.1464, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.8915729376099731, |
|
"learning_rate": 1.1384068136609105e-05, |
|
"loss": 1.1965, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.6967288554874705, |
|
"learning_rate": 1.0972646204589377e-05, |
|
"loss": 1.1211, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.8038608019365008, |
|
"learning_rate": 1.0568364320682178e-05, |
|
"loss": 1.1961, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.8793813680766012, |
|
"learning_rate": 1.0171254908541372e-05, |
|
"loss": 1.0912, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.7353307810953187, |
|
"learning_rate": 9.781349816584162e-06, |
|
"loss": 1.1382, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.7264376523774766, |
|
"learning_rate": 9.398680315436903e-06, |
|
"loss": 1.1503, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.8294378899418743, |
|
"learning_rate": 9.023277095427173e-06, |
|
"loss": 1.1001, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.6847190426952564, |
|
"learning_rate": 8.655170264122303e-06, |
|
"loss": 1.1837, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.7123063489929077, |
|
"learning_rate": 8.294389343914899e-06, |
|
"loss": 1.1665, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.7161490463804174, |
|
"learning_rate": 7.940963269654922e-06, |
|
"loss": 1.2128, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.6942289082099549, |
|
"learning_rate": 7.594920386329252e-06, |
|
"loss": 1.1516, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.7000185077826192, |
|
"learning_rate": 7.256288446788362e-06, |
|
"loss": 1.0752, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.8141301323800174, |
|
"learning_rate": 6.925094609520455e-06, |
|
"loss": 1.1663, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.7801337376407085, |
|
"learning_rate": 6.601365436473439e-06, |
|
"loss": 1.1264, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.826763084695081, |
|
"learning_rate": 6.2851268909245865e-06, |
|
"loss": 1.1076, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.7778658882619575, |
|
"learning_rate": 5.976404335398256e-06, |
|
"loss": 1.185, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5899787648756437, |
|
"learning_rate": 5.675222529631841e-06, |
|
"loss": 1.1383, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.7296161663517282, |
|
"learning_rate": 5.381605628590003e-06, |
|
"loss": 1.1406, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.7308791705621759, |
|
"learning_rate": 5.095577180527378e-06, |
|
"loss": 1.1641, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.7165596394167716, |
|
"learning_rate": 4.817160125100106e-06, |
|
"loss": 1.23, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.8126491682868108, |
|
"learning_rate": 4.546376791525975e-06, |
|
"loss": 1.1496, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.6916149503964037, |
|
"learning_rate": 4.2832488967935795e-06, |
|
"loss": 1.1613, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.6713280206828187, |
|
"learning_rate": 4.02779754392072e-06, |
|
"loss": 1.1778, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.8687927248580933, |
|
"learning_rate": 3.780043220261764e-06, |
|
"loss": 1.1947, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.9156585367622113, |
|
"learning_rate": 3.540005795864709e-06, |
|
"loss": 1.0788, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.8066844554106848, |
|
"learning_rate": 3.3077045218775192e-06, |
|
"loss": 1.2058, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.8260344643458865, |
|
"learning_rate": 3.0831580290041184e-06, |
|
"loss": 1.1859, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.6728735284202596, |
|
"learning_rate": 2.8663843260103074e-06, |
|
"loss": 1.1548, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.8464248337059433, |
|
"learning_rate": 2.6574007982793857e-06, |
|
"loss": 1.1424, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.7582737017596137, |
|
"learning_rate": 2.456224206417812e-06, |
|
"loss": 1.1132, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.7570508766628901, |
|
"learning_rate": 2.262870684911045e-06, |
|
"loss": 1.1916, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.6948287618722808, |
|
"learning_rate": 2.0773557408295343e-06, |
|
"loss": 1.1384, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.7078737172323022, |
|
"learning_rate": 1.8996942525850047e-06, |
|
"loss": 1.1501, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.767530275514686, |
|
"learning_rate": 1.7299004687372665e-06, |
|
"loss": 1.1267, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.7858189878444168, |
|
"learning_rate": 1.5679880068514174e-06, |
|
"loss": 1.1056, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.8212356253611762, |
|
"learning_rate": 1.4139698524057165e-06, |
|
"loss": 1.1939, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.7961839613413594, |
|
"learning_rate": 1.2678583577501624e-06, |
|
"loss": 1.1662, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.850673264738278, |
|
"learning_rate": 1.1296652411158182e-06, |
|
"loss": 1.17, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.7063089618156579, |
|
"learning_rate": 9.994015856749527e-07, |
|
"loss": 1.1253, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.7573170758624328, |
|
"learning_rate": 8.770778386522627e-07, |
|
"loss": 1.0771, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.8837077908686763, |
|
"learning_rate": 7.627038104869199e-07, |
|
"loss": 1.1685, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.7524180298657765, |
|
"learning_rate": 6.562886740457797e-07, |
|
"loss": 1.1406, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.7266831937495886, |
|
"learning_rate": 5.578409638877457e-07, |
|
"loss": 1.0525, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.8486209379295809, |
|
"learning_rate": 4.6736857557925227e-07, |
|
"loss": 1.1348, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.7660696151853608, |
|
"learning_rate": 3.8487876506106966e-07, |
|
"loss": 1.2252, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.7491200820319253, |
|
"learning_rate": 3.1037814806634815e-07, |
|
"loss": 1.1097, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.7181576551220783, |
|
"learning_rate": 2.43872699590042e-07, |
|
"loss": 1.0703, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.7330097823119024, |
|
"learning_rate": 1.8536775340970425e-07, |
|
"loss": 1.1908, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.8782265197206924, |
|
"learning_rate": 1.348680016577397e-07, |
|
"loss": 1.1906, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.7862482683388468, |
|
"learning_rate": 9.237749444505062e-08, |
|
"loss": 1.1362, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.7669692568908726, |
|
"learning_rate": 5.7899639536251883e-08, |
|
"loss": 1.1689, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.8047883869461449, |
|
"learning_rate": 3.143720207635648e-08, |
|
"loss": 1.1729, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.9030824345194336, |
|
"learning_rate": 1.299230436898613e-08, |
|
"loss": 1.196, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.7879987601052318, |
|
"learning_rate": 2.566425706218567e-09, |
|
"loss": 1.0957, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": NaN, |
|
"eval_runtime": 1989.8167, |
|
"eval_samples_per_second": 3.484, |
|
"eval_steps_per_second": 0.871, |
|
"step": 1949 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 1949, |
|
"total_flos": 1.2135540828143616e+16, |
|
"train_loss": 1.2706995834871218, |
|
"train_runtime": 18075.5808, |
|
"train_samples_per_second": 3.45, |
|
"train_steps_per_second": 0.108 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1949, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"total_flos": 1.2135540828143616e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |