{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 200,
  "global_step": 3348,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0002986857825567503, "grad_norm": 0.3188975789150384, "learning_rate": 5.970149253731343e-07, "loss": 0.2045, "step": 1 },
    { "epoch": 0.0014934289127837516, "grad_norm": 0.5466095149619057, "learning_rate": 2.9850746268656716e-06, "loss": 0.5187, "step": 5 },
    { "epoch": 0.002986857825567503, "grad_norm": 0.46728261805758203, "learning_rate": 5.970149253731343e-06, "loss": 0.4648, "step": 10 },
    { "epoch": 0.004480286738351254, "grad_norm": 0.6171065397357978, "learning_rate": 8.955223880597016e-06, "loss": 0.475, "step": 15 },
    { "epoch": 0.005973715651135006, "grad_norm": 0.49334289479515664, "learning_rate": 1.1940298507462686e-05, "loss": 0.4602, "step": 20 },
    { "epoch": 0.007467144563918757, "grad_norm": 0.5174483814171045, "learning_rate": 1.4925373134328357e-05, "loss": 0.4816, "step": 25 },
    { "epoch": 0.008960573476702509, "grad_norm": 0.4471748736139888, "learning_rate": 1.791044776119403e-05, "loss": 0.3534, "step": 30 },
    { "epoch": 0.01045400238948626, "grad_norm": 0.5227128696793767, "learning_rate": 2.0895522388059702e-05, "loss": 0.455, "step": 35 },
    { "epoch": 0.011947431302270013, "grad_norm": 0.57335506943901, "learning_rate": 2.3880597014925373e-05, "loss": 0.4381, "step": 40 },
    { "epoch": 0.013440860215053764, "grad_norm": 0.5672666580878948, "learning_rate": 2.6865671641791047e-05, "loss": 0.4183, "step": 45 },
    { "epoch": 0.014934289127837515, "grad_norm": 0.615033725558188, "learning_rate": 2.9850746268656714e-05, "loss": 0.4493, "step": 50 },
    { "epoch": 0.016427718040621268, "grad_norm": 0.5694295537449918, "learning_rate": 3.283582089552239e-05, "loss": 0.3214, "step": 55 },
    { "epoch": 0.017921146953405017, "grad_norm": 0.4317314011547438, "learning_rate": 3.582089552238806e-05, "loss": 0.3697, "step": 60 },
    { "epoch": 0.01941457586618877, "grad_norm": 0.5686093679349059, "learning_rate": 3.8805970149253736e-05, "loss": 0.3266, "step": 65 },
    { "epoch": 0.02090800477897252, "grad_norm": 0.5271452706616764, "learning_rate": 4.1791044776119404e-05, "loss": 0.365, "step": 70 },
    { "epoch": 0.022401433691756272, "grad_norm": 0.7400114199406753, "learning_rate": 4.477611940298508e-05, "loss": 0.4381, "step": 75 },
    { "epoch": 0.023894862604540025, "grad_norm": 0.5987079894007333, "learning_rate": 4.7761194029850745e-05, "loss": 0.3233, "step": 80 },
    { "epoch": 0.025388291517323774, "grad_norm": 0.5231052349056124, "learning_rate": 5.074626865671642e-05, "loss": 0.3543, "step": 85 },
    { "epoch": 0.026881720430107527, "grad_norm": 0.6540900409905596, "learning_rate": 5.373134328358209e-05, "loss": 0.3387, "step": 90 },
    { "epoch": 0.028375149342891277, "grad_norm": 0.5922650571689663, "learning_rate": 5.671641791044776e-05, "loss": 0.3648, "step": 95 },
    { "epoch": 0.02986857825567503, "grad_norm": 0.671441503007203, "learning_rate": 5.970149253731343e-05, "loss": 0.3842, "step": 100 },
    { "epoch": 0.03136200716845878, "grad_norm": 0.5589374162458473, "learning_rate": 6.268656716417911e-05, "loss": 0.3074, "step": 105 },
    { "epoch": 0.032855436081242535, "grad_norm": 0.5380602393505818, "learning_rate": 6.567164179104478e-05, "loss": 0.3507, "step": 110 },
    { "epoch": 0.034348864994026285, "grad_norm": 0.6063877341870142, "learning_rate": 6.865671641791044e-05, "loss": 0.3401, "step": 115 },
    { "epoch": 0.035842293906810034, "grad_norm": 0.5260512769152438, "learning_rate": 7.164179104477612e-05, "loss": 0.3724, "step": 120 },
    { "epoch": 0.03733572281959379, "grad_norm": 0.5959345681808844, "learning_rate": 7.46268656716418e-05, "loss": 0.3713, "step": 125 },
    { "epoch": 0.03882915173237754, "grad_norm": 0.5141504485060038, "learning_rate": 7.761194029850747e-05, "loss": 0.306, "step": 130 },
    { "epoch": 0.04032258064516129, "grad_norm": 0.45162013987431543, "learning_rate": 8.059701492537314e-05, "loss": 0.3279, "step": 135 },
    { "epoch": 0.04181600955794504, "grad_norm": 0.5664477964459901, "learning_rate": 8.358208955223881e-05, "loss": 0.2853, "step": 140 },
    { "epoch": 0.043309438470728795, "grad_norm": 0.4935236502406577, "learning_rate": 8.656716417910447e-05, "loss": 0.3515, "step": 145 },
    { "epoch": 0.044802867383512544, "grad_norm": 0.49883746692891684, "learning_rate": 8.955223880597016e-05, "loss": 0.3112, "step": 150 },
    { "epoch": 0.046296296296296294, "grad_norm": 0.5370459530188539, "learning_rate": 9.253731343283582e-05, "loss": 0.2918, "step": 155 },
    { "epoch": 0.04778972520908005, "grad_norm": 0.5147364002910404, "learning_rate": 9.552238805970149e-05, "loss": 0.3427, "step": 160 },
    { "epoch": 0.0492831541218638, "grad_norm": 0.48526830628523326, "learning_rate": 9.850746268656717e-05, "loss": 0.3272, "step": 165 },
    { "epoch": 0.05077658303464755, "grad_norm": 0.44536442368732077, "learning_rate": 0.00010149253731343284, "loss": 0.3216, "step": 170 },
    { "epoch": 0.052270011947431305, "grad_norm": 0.6529775264003637, "learning_rate": 0.0001044776119402985, "loss": 0.3343, "step": 175 },
    { "epoch": 0.053763440860215055, "grad_norm": 0.500538842360273, "learning_rate": 0.00010746268656716419, "loss": 0.2882, "step": 180 },
    { "epoch": 0.055256869772998804, "grad_norm": 0.45720922405183984, "learning_rate": 0.00011044776119402987, "loss": 0.3565, "step": 185 },
    { "epoch": 0.05675029868578255, "grad_norm": 0.395572707884918, "learning_rate": 0.00011343283582089552, "loss": 0.2879, "step": 190 },
    { "epoch": 0.05824372759856631, "grad_norm": 0.38317766271575404, "learning_rate": 0.0001164179104477612, "loss": 0.326, "step": 195 },
    { "epoch": 0.05973715651135006, "grad_norm": 0.5636861131014499, "learning_rate": 0.00011940298507462686, "loss": 0.3403, "step": 200 },
    { "epoch": 0.05973715651135006, "eval_loss": 0.3074450194835663, "eval_runtime": 675.2136, "eval_samples_per_second": 5.924, "eval_steps_per_second": 0.741, "step": 200 },
    { "epoch": 0.06123058542413381, "grad_norm": 0.5285903241151306, "learning_rate": 0.00012238805970149255, "loss": 0.3054, "step": 205 },
    { "epoch": 0.06272401433691756, "grad_norm": 0.462285447656946, "learning_rate": 0.00012537313432835822, "loss": 0.3289, "step": 210 },
    { "epoch": 0.06421744324970131, "grad_norm": 0.41949653076343973, "learning_rate": 0.00012835820895522389, "loss": 0.3235, "step": 215 },
    { "epoch": 0.06571087216248507, "grad_norm": 0.37413394493320784, "learning_rate": 0.00013134328358208955, "loss": 0.3284, "step": 220 },
    { "epoch": 0.06720430107526881, "grad_norm": 0.43599604159082916, "learning_rate": 0.00013432835820895525, "loss": 0.3509, "step": 225 },
    { "epoch": 0.06869772998805257, "grad_norm": 0.45911550206829044, "learning_rate": 0.0001373134328358209, "loss": 0.2856, "step": 230 },
    { "epoch": 0.07019115890083633, "grad_norm": 0.6197373846263158, "learning_rate": 0.00014029850746268658, "loss": 0.3381, "step": 235 },
    { "epoch": 0.07168458781362007, "grad_norm": 0.5162001410157283, "learning_rate": 0.00014328358208955225, "loss": 0.2987, "step": 240 },
    { "epoch": 0.07317801672640382, "grad_norm": 0.3756162304697717, "learning_rate": 0.00014626865671641792, "loss": 0.3016, "step": 245 },
    { "epoch": 0.07467144563918758, "grad_norm": 0.4427925525549972, "learning_rate": 0.0001492537313432836, "loss": 0.3301, "step": 250 },
    { "epoch": 0.07616487455197132, "grad_norm": 0.4587549108919026, "learning_rate": 0.00015223880597014925, "loss": 0.2864, "step": 255 },
    { "epoch": 0.07765830346475508, "grad_norm": 0.4539343579685493, "learning_rate": 0.00015522388059701495, "loss": 0.3263, "step": 260 },
    { "epoch": 0.07915173237753884, "grad_norm": 0.4601786532582476, "learning_rate": 0.00015820895522388059, "loss": 0.3257, "step": 265 },
    { "epoch": 0.08064516129032258, "grad_norm": 0.4952572370631647, "learning_rate": 0.00016119402985074628, "loss": 0.3227, "step": 270 },
    { "epoch": 0.08213859020310633, "grad_norm": 0.5594600091045848, "learning_rate": 0.00016417910447761195, "loss": 0.3662, "step": 275 },
    { "epoch": 0.08363201911589008, "grad_norm": 0.4150317719098373, "learning_rate": 0.00016716417910447761, "loss": 0.251, "step": 280 },
    { "epoch": 0.08512544802867383, "grad_norm": 0.45530314509050923, "learning_rate": 0.00017014925373134328, "loss": 0.3277, "step": 285 },
    { "epoch": 0.08661887694145759, "grad_norm": 0.3707658637725497, "learning_rate": 0.00017313432835820895, "loss": 0.2958, "step": 290 },
    { "epoch": 0.08811230585424133, "grad_norm": 0.3532362466068546, "learning_rate": 0.00017611940298507464, "loss": 0.3275, "step": 295 },
    { "epoch": 0.08960573476702509, "grad_norm": 0.41975328143934704, "learning_rate": 0.0001791044776119403, "loss": 0.361, "step": 300 },
    { "epoch": 0.09109916367980884, "grad_norm": 0.3889061650864088, "learning_rate": 0.00018208955223880598, "loss": 0.2696, "step": 305 },
    { "epoch": 0.09259259259259259, "grad_norm": 0.46543359027880427, "learning_rate": 0.00018507462686567165, "loss": 0.3443, "step": 310 },
    { "epoch": 0.09408602150537634, "grad_norm": 0.5714057611200172, "learning_rate": 0.00018805970149253734, "loss": 0.3374, "step": 315 },
    { "epoch": 0.0955794504181601, "grad_norm": 0.3948901958880527, "learning_rate": 0.00019104477611940298, "loss": 0.321, "step": 320 },
    { "epoch": 0.09707287933094384, "grad_norm": 0.5540915005920514, "learning_rate": 0.00019402985074626867, "loss": 0.3277, "step": 325 },
    { "epoch": 0.0985663082437276, "grad_norm": 0.48042615725477694, "learning_rate": 0.00019701492537313434, "loss": 0.2833, "step": 330 },
    { "epoch": 0.10005973715651136, "grad_norm": 0.35451829573422367, "learning_rate": 0.0002, "loss": 0.2882, "step": 335 },
    { "epoch": 0.1015531660692951, "grad_norm": 0.37754764974968846, "learning_rate": 0.00019999864102799163, "loss": 0.2981, "step": 340 },
    { "epoch": 0.10304659498207885, "grad_norm": 0.41754484552953547, "learning_rate": 0.0001999945641489025, "loss": 0.3385, "step": 345 },
    { "epoch": 0.10454002389486261, "grad_norm": 0.48242379028499566, "learning_rate": 0.00019998776947353995, "loss": 0.3462, "step": 350 },
    { "epoch": 0.10603345280764635, "grad_norm": 0.43615681396287914, "learning_rate": 0.00019997825718657945, "loss": 0.2829, "step": 355 },
    { "epoch": 0.10752688172043011, "grad_norm": 0.5382667372932559, "learning_rate": 0.0001999660275465596, "loss": 0.3086, "step": 360 },
    { "epoch": 0.10902031063321387, "grad_norm": 0.4221095850996317, "learning_rate": 0.00019995108088587528, "loss": 0.2853, "step": 365 },
    { "epoch": 0.11051373954599761, "grad_norm": 0.4049511576507522, "learning_rate": 0.00019993341761076824, "loss": 0.3144, "step": 370 },
    { "epoch": 0.11200716845878136, "grad_norm": 0.41672237725910044, "learning_rate": 0.00019991303820131645, "loss": 0.3262, "step": 375 },
    { "epoch": 0.1135005973715651, "grad_norm": 0.37665369388894876, "learning_rate": 0.00019988994321142088, "loss": 0.2746, "step": 380 },
    { "epoch": 0.11499402628434886, "grad_norm": 0.38890047129761735, "learning_rate": 0.00019986413326879035, "loss": 0.3194, "step": 385 },
    { "epoch": 0.11648745519713262, "grad_norm": 0.37534439629256694, "learning_rate": 0.00019983560907492476, "loss": 0.3134, "step": 390 },
    { "epoch": 0.11798088410991636, "grad_norm": 0.3401949072629029, "learning_rate": 0.00019980437140509563, "loss": 0.3198, "step": 395 },
    { "epoch": 0.11947431302270012, "grad_norm": 0.4404312617632526, "learning_rate": 0.00019977042110832537, "loss": 0.3224, "step": 400 },
    { "epoch": 0.11947431302270012, "eval_loss": 0.29538044333457947, "eval_runtime": 671.0671, "eval_samples_per_second": 5.961, "eval_steps_per_second": 0.745, "step": 400 },
    { "epoch": 0.12096774193548387, "grad_norm": 0.3877778527340937, "learning_rate": 0.00019973375910736408, "loss": 0.2605, "step": 405 },
    { "epoch": 0.12246117084826762, "grad_norm": 0.42449178944770155, "learning_rate": 0.00019969438639866436, "loss": 0.3292, "step": 410 },
    { "epoch": 0.12395459976105137, "grad_norm": 0.4421146156020614, "learning_rate": 0.00019965230405235443, "loss": 0.2847, "step": 415 },
    { "epoch": 0.12544802867383512, "grad_norm": 0.33313138965230243, "learning_rate": 0.00019960751321220887, "loss": 0.3346, "step": 420 },
    { "epoch": 0.12694145758661887, "grad_norm": 0.35768831528441764, "learning_rate": 0.0001995600150956177, "loss": 0.2896, "step": 425 },
    { "epoch": 0.12843488649940263, "grad_norm": 0.4073503627739374, "learning_rate": 0.0001995098109935531, "loss": 0.2963, "step": 430 },
    { "epoch": 0.12992831541218638, "grad_norm": 0.4634204409239979, "learning_rate": 0.00019945690227053445, "loss": 0.309, "step": 435 },
    { "epoch": 0.13142174432497014, "grad_norm": 0.48789887803899984, "learning_rate": 0.00019940129036459121, "loss": 0.2778, "step": 440 },
    { "epoch": 0.13291517323775387, "grad_norm": 0.41135714129715417, "learning_rate": 0.0001993429767872239, "loss": 0.312, "step": 445 },
    { "epoch": 0.13440860215053763, "grad_norm": 0.48911130889740795, "learning_rate": 0.00019928196312336285, "loss": 0.354, "step": 450 },
    { "epoch": 0.13590203106332138, "grad_norm": 0.31087210214054994, "learning_rate": 0.00019921825103132531, "loss": 0.2581, "step": 455 },
    { "epoch": 0.13739545997610514, "grad_norm": 0.34319695770593983, "learning_rate": 0.00019915184224277032, "loss": 0.2861, "step": 460 },
    { "epoch": 0.1388888888888889, "grad_norm": 0.39072017479958165, "learning_rate": 0.00019908273856265152, "loss": 0.28, "step": 465 },
    { "epoch": 0.14038231780167265, "grad_norm": 0.38108092212692735, "learning_rate": 0.00019901094186916825, "loss": 0.2938, "step": 470 },
    { "epoch": 0.14187574671445638, "grad_norm": 0.40284399729512566, "learning_rate": 0.00019893645411371447, "loss": 0.3232, "step": 475 },
    { "epoch": 0.14336917562724014, "grad_norm": 0.41081349055067273, "learning_rate": 0.00019885927732082563, "loss": 0.2682, "step": 480 },
    { "epoch": 0.1448626045400239, "grad_norm": 0.35453420174268324, "learning_rate": 0.00019877941358812382, "loss": 0.3099, "step": 485 },
    { "epoch": 0.14635603345280765, "grad_norm": 0.4219522172174877, "learning_rate": 0.00019869686508626054, "loss": 0.2929, "step": 490 },
    { "epoch": 0.1478494623655914, "grad_norm": 0.433146389991006, "learning_rate": 0.00019861163405885787, "loss": 0.3255, "step": 495 },
    { "epoch": 0.14934289127837516, "grad_norm": 0.43636524610968963, "learning_rate": 0.0001985237228224474, "loss": 0.3251, "step": 500 },
    { "epoch": 0.1508363201911589, "grad_norm": 0.39903987473575914, "learning_rate": 0.00019843313376640732, "loss": 0.2786, "step": 505 },
    { "epoch": 0.15232974910394265, "grad_norm": 0.42757498238366765, "learning_rate": 0.0001983398693528975, "loss": 0.3365, "step": 510 },
    { "epoch": 0.1538231780167264, "grad_norm": 0.4030570437823937, "learning_rate": 0.00019824393211679246, "loss": 0.2501, "step": 515 },
    { "epoch": 0.15531660692951016, "grad_norm": 0.3624497193987114, "learning_rate": 0.00019814532466561259, "loss": 0.2987, "step": 520 },
    { "epoch": 0.15681003584229392, "grad_norm": 0.4694503509924643, "learning_rate": 0.00019804404967945315, "loss": 0.3244, "step": 525 },
    { "epoch": 0.15830346475507767, "grad_norm": 0.43327424018713545, "learning_rate": 0.00019794010991091164, "loss": 0.2656, "step": 530 },
    { "epoch": 0.1597968936678614, "grad_norm": 0.36124038110578643, "learning_rate": 0.00019783350818501272, "loss": 0.2949, "step": 535 },
    { "epoch": 0.16129032258064516, "grad_norm": 0.4813284517384988, "learning_rate": 0.00019772424739913168, "loss": 0.2527, "step": 540 },
    { "epoch": 0.1627837514934289, "grad_norm": 0.517676779630334, "learning_rate": 0.00019761233052291544, "loss": 0.3322, "step": 545 },
    { "epoch": 0.16427718040621267, "grad_norm": 0.41586609149970155, "learning_rate": 0.0001974977605982021, "loss": 0.3058, "step": 550 },
    { "epoch": 0.16577060931899643, "grad_norm": 0.38142158790681063, "learning_rate": 0.00019738054073893807, "loss": 0.2617, "step": 555 },
    { "epoch": 0.16726403823178015, "grad_norm": 0.4508850152729104, "learning_rate": 0.00019726067413109347, "loss": 0.3124, "step": 560 },
    { "epoch": 0.1687574671445639, "grad_norm": 0.38711879954880984, "learning_rate": 0.0001971381640325756, "loss": 0.2886, "step": 565 },
    { "epoch": 0.17025089605734767, "grad_norm": 0.3713159311530026, "learning_rate": 0.00019701301377314038, "loss": 0.2884, "step": 570 },
    { "epoch": 0.17174432497013142, "grad_norm": 0.4253789489790291, "learning_rate": 0.0001968852267543018, "loss": 0.3403, "step": 575 },
    { "epoch": 0.17323775388291518, "grad_norm": 0.3716154947554444, "learning_rate": 0.00019675480644923944, "loss": 0.2563, "step": 580 },
    { "epoch": 0.17473118279569894, "grad_norm": 0.4067400864065092, "learning_rate": 0.00019662175640270424, "loss": 0.3139, "step": 585 },
    { "epoch": 0.17622461170848266, "grad_norm": 0.3587456607358574, "learning_rate": 0.00019648608023092195, "loss": 0.2851, "step": 590 },
    { "epoch": 0.17771804062126642, "grad_norm": 0.35869894756290216, "learning_rate": 0.00019634778162149497, "loss": 0.299, "step": 595 },
    { "epoch": 0.17921146953405018, "grad_norm": 0.4291965063428789, "learning_rate": 0.00019620686433330207, "loss": 0.3055, "step": 600 },
    { "epoch": 0.17921146953405018, "eval_loss": 0.28864020109176636, "eval_runtime": 670.6057, "eval_samples_per_second": 5.965, "eval_steps_per_second": 0.746, "step": 600 },
    { "epoch": 0.18070489844683393, "grad_norm": 0.4107636120154602, "learning_rate": 0.00019606333219639624, "loss": 0.2674, "step": 605 },
    { "epoch": 0.1821983273596177, "grad_norm": 0.4094821475481708, "learning_rate": 0.00019591718911190066, "loss": 0.2748, "step": 610 },
    { "epoch": 0.18369175627240145, "grad_norm": 0.42712744392327107, "learning_rate": 0.00019576843905190253, "loss": 0.2913, "step": 615 },
    { "epoch": 0.18518518518518517, "grad_norm": 0.3838363588569511, "learning_rate": 0.00019561708605934515, "loss": 0.3019, "step": 620 },
    { "epoch": 0.18667861409796893, "grad_norm": 0.4060361946228662, "learning_rate": 0.0001954631342479182, "loss": 0.3028, "step": 625 },
    { "epoch": 0.1881720430107527, "grad_norm": 0.36895107070060834, "learning_rate": 0.0001953065878019457, "loss": 0.2912, "step": 630 },
    { "epoch": 0.18966547192353644, "grad_norm": 0.5389015474723242, "learning_rate": 0.0001951474509762724, "loss": 0.3277, "step": 635 },
    { "epoch": 0.1911589008363202, "grad_norm": 0.3713681816986798, "learning_rate": 0.0001949857280961481, "loss": 0.2564, "step": 640 },
    { "epoch": 0.19265232974910393, "grad_norm": 0.39787955963512267, "learning_rate": 0.00019482142355711023, "loss": 0.3167, "step": 645 },
    { "epoch": 0.19414575866188769, "grad_norm": 0.4160144539607265, "learning_rate": 0.0001946545418248641, "loss": 0.3473, "step": 650 },
    { "epoch": 0.19563918757467144, "grad_norm": 0.4611729979684992, "learning_rate": 0.00019448508743516186, "loss": 0.2579, "step": 655 },
    { "epoch": 0.1971326164874552, "grad_norm": 0.37926676547852983, "learning_rate": 0.00019431306499367886, "loss": 0.2995, "step": 660 },
    { "epoch": 0.19862604540023895, "grad_norm": 0.41885156529130685, "learning_rate": 0.00019413847917588878, "loss": 0.3042, "step": 665 },
    { "epoch": 0.2001194743130227, "grad_norm": 0.427624169132239, "learning_rate": 0.00019396133472693642, "loss": 0.3173, "step": 670 },
    { "epoch": 0.20161290322580644, "grad_norm": 0.38475710691405496, "learning_rate": 0.00019378163646150876, "loss": 0.3021, "step": 675 },
    { "epoch": 0.2031063321385902, "grad_norm": 0.4138321670681785, "learning_rate": 0.000193599389263704, "loss": 0.274, "step": 680 },
    { "epoch": 0.20459976105137395, "grad_norm": 0.355884010075492, "learning_rate": 0.00019341459808689898, "loss": 0.3393, "step": 685 },
    { "epoch": 0.2060931899641577, "grad_norm": 0.3631472347986104, "learning_rate": 0.00019322726795361443, "loss": 0.2949, "step": 690 },
    { "epoch": 0.20758661887694146, "grad_norm": 0.4773277681201716, "learning_rate": 0.0001930374039553785, "loss": 0.301, "step": 695 },
    { "epoch": 0.20908004778972522, "grad_norm": 0.448239132237338, "learning_rate": 0.00019284501125258835, "loss": 0.3332, "step": 700 },
    { "epoch": 0.21057347670250895, "grad_norm": 0.38679234194418, "learning_rate": 0.00019265009507436997, "loss": 0.2821, "step": 705 },
    { "epoch": 0.2120669056152927, "grad_norm": 0.34050133768678587, "learning_rate": 0.00019245266071843596, "loss": 0.2971, "step": 710 },
    { "epoch": 0.21356033452807646, "grad_norm": 0.377904449126756, "learning_rate": 0.00019225271355094155, "loss": 0.2644, "step": 715 },
    { "epoch": 0.21505376344086022, "grad_norm": 0.43091731111413145, "learning_rate": 0.00019205025900633884, "loss": 0.3028, "step": 720 },
    { "epoch": 0.21654719235364397, "grad_norm": 0.35823568635784103, "learning_rate": 0.00019184530258722899, "loss": 0.2941, "step": 725 },
    { "epoch": 0.21804062126642773, "grad_norm": 0.3871906521592633, "learning_rate": 0.00019163784986421276, "loss": 0.2453, "step": 730 },
    { "epoch": 0.21953405017921146, "grad_norm": 0.3870097665982384, "learning_rate": 0.00019142790647573902, "loss": 0.3163, "step": 735 },
    { "epoch": 0.22102747909199522, "grad_norm": 0.4043861350049887, "learning_rate": 0.00019121547812795152, "loss": 0.2867, "step": 740 },
    { "epoch": 0.22252090800477897, "grad_norm": 0.3472296001968241, "learning_rate": 0.00019100057059453381, "loss": 0.308, "step": 745 },
    { "epoch": 0.22401433691756273, "grad_norm": 0.4555451484017787, "learning_rate": 0.00019078318971655237, "loss": 0.3094, "step": 750 },
    { "epoch": 0.22550776583034648, "grad_norm": 0.39417763113385473, "learning_rate": 0.00019056334140229777, "loss": 0.2673, "step": 755 },
    { "epoch": 0.2270011947431302, "grad_norm": 0.33903321452810364, "learning_rate": 0.00019034103162712408, "loss": 0.318, "step": 760 },
    { "epoch": 0.22849462365591397, "grad_norm": 0.37450897393444954, "learning_rate": 0.0001901162664332866, "loss": 0.2543, "step": 765 },
    { "epoch": 0.22998805256869773, "grad_norm": 0.3798157275859589, "learning_rate": 0.0001898890519297774, "loss": 0.2945, "step": 770 },
    { "epoch": 0.23148148148148148, "grad_norm": 0.3682423785435909, "learning_rate": 0.00018965939429215948, "loss": 0.3127, "step": 775 },
    { "epoch": 0.23297491039426524, "grad_norm": 0.3771090235525355, "learning_rate": 0.0001894272997623989, "loss": 0.2722, "step": 780 },
    { "epoch": 0.234468339307049, "grad_norm": 0.33941544966890785, "learning_rate": 0.00018919277464869504, "loss": 0.3163, "step": 785 },
    { "epoch": 0.23596176821983272, "grad_norm": 0.37376605453569783, "learning_rate": 0.0001889558253253092, "loss": 0.2598, "step": 790 },
    { "epoch": 0.23745519713261648, "grad_norm": 0.38959423980866825, "learning_rate": 0.00018871645823239128, "loss": 0.2983, "step": 795 },
    { "epoch": 0.23894862604540024, "grad_norm": 0.38139963613722794, "learning_rate": 0.00018847467987580493, "loss": 0.2899, "step": 800 },
    { "epoch": 0.23894862604540024, "eval_loss": 0.2804073989391327, "eval_runtime": 670.1656, "eval_samples_per_second": 5.969, "eval_steps_per_second": 0.746, "step": 800 },
    { "epoch": 0.240442054958184, "grad_norm": 0.40419199411712203, "learning_rate": 0.00018823049682695052, "loss": 0.3008, "step": 805 },
    { "epoch": 0.24193548387096775, "grad_norm": 0.3612377971129905, "learning_rate": 0.0001879839157225866, "loss": 0.316, "step": 810 },
    { "epoch": 0.2434289127837515, "grad_norm": 0.4605084068043848, "learning_rate": 0.0001877349432646495, "loss": 0.2776, "step": 815 },
    { "epoch": 0.24492234169653523, "grad_norm": 0.39686115392847526, "learning_rate": 0.0001874835862200713, "loss": 0.2664, "step": 820 },
    { "epoch": 0.246415770609319, "grad_norm": 0.38181276219849275, "learning_rate": 0.00018722985142059572, "loss": 0.2896, "step": 825 },
    { "epoch": 0.24790919952210275, "grad_norm": 0.36529483511621264, "learning_rate": 0.0001869737457625926, "loss": 0.2939, "step": 830 },
    { "epoch": 0.2494026284348865, "grad_norm": 0.4035594565736224, "learning_rate": 0.00018671527620687034, "loss": 0.3102, "step": 835 },
    { "epoch": 0.25089605734767023, "grad_norm": 0.3854541007588229, "learning_rate": 0.00018645444977848677, "loss": 0.2769, "step": 840 },
    { "epoch": 0.252389486260454, "grad_norm": 0.42123392936346155, "learning_rate": 0.00018619127356655813, "loss": 0.308, "step": 845 },
    { "epoch": 0.25388291517323774, "grad_norm": 0.49950892415773196, "learning_rate": 0.0001859257547240666, "loss": 0.314, "step": 850 },
    { "epoch": 0.2553763440860215, "grad_norm": 0.40906372202780966, "learning_rate": 0.00018565790046766564, "loss": 0.2881, "step": 855 },
    { "epoch": 0.25686977299880526, "grad_norm": 0.38764669514338385, "learning_rate": 0.000185387718077484, "loss": 0.3027, "step": 860 },
    { "epoch": 0.258363201911589, "grad_norm": 0.42262538628601043, "learning_rate": 0.00018511521489692775, "loss": 0.2545, "step": 865 },
    { "epoch": 0.25985663082437277, "grad_norm": 0.41209507390817657, "learning_rate": 0.00018484039833248085, "loss": 0.2828, "step": 870 },
    { "epoch": 0.2613500597371565, "grad_norm": 0.4469642279365962, "learning_rate": 0.0001845632758535036, "loss": 0.332, "step": 875 },
    { "epoch": 0.2628434886499403, "grad_norm": 0.36473061276227664, "learning_rate": 0.00018428385499202988, "loss": 0.2647, "step": 880 },
    { "epoch": 0.26433691756272404, "grad_norm": 0.4433634956798964, "learning_rate": 0.00018400214334256227, "loss": 0.2952, "step": 885 },
    { "epoch": 0.26583034647550774, "grad_norm": 0.3494167236815255, "learning_rate": 0.00018371814856186572, "loss": 0.2825, "step": 890 },
    { "epoch": 0.2673237753882915, "grad_norm": 0.38005445773872815, "learning_rate": 0.00018343187836875928, "loss": 0.3096, "step": 895 },
    { "epoch": 0.26881720430107525, "grad_norm": 0.5133589136001121, "learning_rate": 0.00018314334054390664, "loss": 0.3078, "step": 900 },
    { "epoch": 0.270310633213859, "grad_norm": 0.37745450032232525, "learning_rate": 0.00018285254292960433, "loss": 0.2687, "step": 905 },
    { "epoch": 0.27180406212664276, "grad_norm": 0.40501108505960953, "learning_rate": 0.00018255949342956863, "loss": 0.2969, "step": 910 },
    { "epoch": 0.2732974910394265, "grad_norm": 0.4103922981810604, "learning_rate": 0.000182264200008721, "loss": 0.2635, "step": 915 },
    { "epoch": 0.2747909199522103, "grad_norm": 0.8269261010911084, "learning_rate": 0.00018196667069297123, "loss": 0.2894, "step": 920 },
    { "epoch": 0.27628434886499403, "grad_norm": 0.5797869922585875, "learning_rate": 0.0001816669135689996, "loss": 0.3175, "step": 925 },
    { "epoch": 0.2777777777777778, "grad_norm": 0.3960727461959874, "learning_rate": 0.00018136493678403686, "loss": 0.2122, "step": 930 },
    { "epoch": 0.27927120669056155, "grad_norm": 0.3281979427197208, "learning_rate": 0.00018106074854564306, "loss": 0.2772, "step": 935 },
    { "epoch": 0.2807646356033453, "grad_norm": 0.3180009593314954, "learning_rate": 0.00018075435712148417, "loss": 0.2494, "step": 940 },
    { "epoch": 0.28225806451612906, "grad_norm": 0.5630380699907694, "learning_rate": 0.00018044577083910758, "loss": 0.287, "step": 945 },
    { "epoch": 0.28375149342891276, "grad_norm": 0.5821492853630149, "learning_rate": 0.00018013499808571567, "loss": 0.3064, "step": 950 },
    { "epoch": 0.2852449223416965, "grad_norm": 0.34343056721676674, "learning_rate": 0.00017982204730793795, "loss": 0.2479, "step": 955 },
    { "epoch": 0.2867383512544803, "grad_norm": 0.4045824911297299, "learning_rate": 0.0001795069270116013, "loss": 0.3042, "step": 960 },
    { "epoch": 0.28823178016726403, "grad_norm": 0.4423077224611283, "learning_rate": 0.000179189645761499, "loss": 0.2587, "step": 965 },
    { "epoch": 0.2897252090800478, "grad_norm": 0.3881310780181844, "learning_rate": 0.00017887021218115782, "loss": 0.2883, "step": 970 },
    { "epoch": 0.29121863799283154, "grad_norm": 0.4157883620231867, "learning_rate": 0.00017854863495260354, "loss": 0.3163, "step": 975 },
    { "epoch": 0.2927120669056153, "grad_norm": 0.5286260329472577, "learning_rate": 0.00017822492281612532, "loss": 0.2784, "step": 980 },
    { "epoch": 0.29420549581839905, "grad_norm": 0.43556243885834467, "learning_rate": 0.00017789908457003777, "loss": 0.2934, "step": 985 },
    { "epoch": 0.2956989247311828, "grad_norm": 0.46879974456717627, "learning_rate": 0.000177571129070442, "loss": 0.273, "step": 990 },
    { "epoch": 0.29719235364396657, "grad_norm": 0.3913970910916109, "learning_rate": 0.00017724106523098486, "loss": 0.2864, "step": 995 },
    { "epoch": 0.2986857825567503, "grad_norm": 0.5166170033604143, "learning_rate": 0.00017690890202261676, "loss": 0.3116, "step": 1000 },
    { "epoch": 0.2986857825567503, "eval_loss": 0.27715662121772766, "eval_runtime": 670.7329, "eval_samples_per_second": 5.964, "eval_steps_per_second": 0.745, "step": 1000 },
    { "epoch": 0.300179211469534, "grad_norm": 0.49927259244291966, "learning_rate": 0.00017657464847334775, "loss": 0.2295, "step": 1005 },
    { "epoch": 0.3016726403823178, "grad_norm": 0.4015981411197979, "learning_rate": 0.0001762383136680022, "loss": 0.2807, "step": 1010 },
    { "epoch": 0.30316606929510154, "grad_norm": 0.33536145313741844, "learning_rate": 0.00017589990674797171, "loss": 0.2683, "step": 1015 },
    { "epoch": 0.3046594982078853, "grad_norm": 0.4291404869332869, "learning_rate": 0.00017555943691096706, "loss": 0.2704, "step": 1020 },
    { "epoch": 0.30615292712066905, "grad_norm": 0.47290922277355507, "learning_rate": 0.00017521691341076774, "loss": 0.3136, "step": 1025 },
    { "epoch": 0.3076463560334528, "grad_norm": 0.4688378459680159, "learning_rate": 0.00017487234555697072, "loss": 0.272, "step": 1030 },
    { "epoch": 0.30913978494623656, "grad_norm": 0.36206038691433684, "learning_rate": 0.0001745257427147374, "loss": 0.2949, "step": 1035 },
    { "epoch": 0.3106332138590203, "grad_norm": 0.34718258949903247, "learning_rate": 0.00017417711430453897, "loss": 0.2724, "step": 1040 },
    { "epoch": 0.3121266427718041, "grad_norm": 0.35896609288857995, "learning_rate": 0.00017382646980190048, "loss": 0.2893, "step": 1045 },
    { "epoch": 0.31362007168458783, "grad_norm": 0.4371043853081932, "learning_rate": 0.00017347381873714316, "loss": 0.2874, "step": 1050 },
    { "epoch": 0.3151135005973716, "grad_norm": 0.33305502199179604, "learning_rate": 0.00017311917069512555, "loss": 0.2329, "step": 1055 },
    { "epoch": 0.31660692951015534, "grad_norm": 0.43013382325716004, "learning_rate": 0.00017276253531498293, "loss": 0.312, "step": 1060 },
    { "epoch": 0.31810035842293904, "grad_norm": 0.347929106798288, "learning_rate": 0.00017240392228986518, "loss": 0.2648, "step": 1065 },
    { "epoch": 0.3195937873357228, "grad_norm": 0.3492181786101714, "learning_rate": 0.00017204334136667365, "loss": 0.2821, "step": 1070 },
    { "epoch": 0.32108721624850656, "grad_norm": 0.45792503261610795, "learning_rate": 0.0001716808023457959, "loss": 0.3277, "step": 1075 },
    { "epoch": 0.3225806451612903, "grad_norm": 0.3654545092753955, "learning_rate": 0.00017131631508083962, "loss": 0.2675, "step": 1080 },
    { "epoch": 0.32407407407407407, "grad_norm": 0.37947758138241466, "learning_rate": 0.0001709498894783646, "loss": 0.2893, "step": 1085 },
    { "epoch": 0.3255675029868578, "grad_norm": 0.4068572595536966, "learning_rate": 0.0001705815354976135, "loss": 0.2435, "step": 1090 },
    { "epoch": 0.3270609318996416, "grad_norm": 0.4809381438376721, "learning_rate": 0.00017021126315024145, "loss": 0.2985, "step": 1095 },
    { "epoch": 0.32855436081242534, "grad_norm": 0.40059623315230986, "learning_rate": 0.00016983908250004344, "loss": 0.273, "step": 1100 },
    { "epoch": 0.3300477897252091, "grad_norm": 0.4052544383737238, "learning_rate": 0.00016946500366268123, "loss": 0.2433, "step": 1105 },
    { "epoch": 0.33154121863799285, "grad_norm": 0.3560399096095347, "learning_rate": 0.0001690890368054082, "loss": 0.2932, "step": 1110 },
    { "epoch": 0.3330346475507766, "grad_norm": 0.4156230656107325, "learning_rate": 0.00016871119214679304, "loss": 0.2685, "step": 1115 },
    { "epoch": 0.3345280764635603, "grad_norm": 0.3271097281515262, "learning_rate": 0.00016833147995644202, "loss": 0.3102, "step": 1120 },
    { "epoch": 0.33602150537634407, "grad_norm": 0.4217178620957521, "learning_rate": 0.00016794991055471992, "loss": 0.293, "step": 1125 },
    { "epoch": 0.3375149342891278, "grad_norm": 0.37416501140273456, "learning_rate": 0.00016756649431246953, "loss": 0.2479, "step": 1130 },
    { "epoch": 0.3390083632019116, "grad_norm": 0.45055007223505494, "learning_rate": 0.00016718124165072953, "loss": 0.2836, "step": 1135 },
    { "epoch": 0.34050179211469533, "grad_norm": 0.40163491862797124, "learning_rate": 0.0001667941630404517, "loss": 0.2484, "step": 1140 },
    { "epoch": 0.3419952210274791, "grad_norm": 0.41798757856275687, "learning_rate": 0.00016640526900221593, "loss": 0.2881, "step": 1145 },
    { "epoch": 0.34348864994026285, "grad_norm": 0.38258918057820673, "learning_rate": 0.00016601457010594447, "loss": 0.3107, "step": 1150 },
    { "epoch": 0.3449820788530466, "grad_norm": 0.3845197614655172, "learning_rate": 0.0001656220769706146, "loss": 0.2848, "step": 1155 },
    { "epoch": 0.34647550776583036, "grad_norm": 0.3985931675974656, "learning_rate": 0.00016522780026397, "loss": 0.2887, "step": 1160 },
    { "epoch": 0.3479689366786141, "grad_norm": 0.35770417481369043, "learning_rate": 0.00016483175070223081, "loss": 0.2504, "step": 1165 },
    { "epoch": 0.34946236559139787, "grad_norm": 0.4093106514843705, "learning_rate": 0.00016443393904980242, "loss": 0.2898, "step": 1170 },
    { "epoch": 0.35095579450418163, "grad_norm": 0.4539044394175168, "learning_rate": 0.00016403437611898282, "loss": 0.2809, "step": 1175 },
    { "epoch": 0.35244922341696533, "grad_norm": 0.36797578012073034, "learning_rate": 0.0001636330727696688, "loss": 0.2529, "step": 1180 },
    { "epoch": 0.3539426523297491, "grad_norm": 0.3182084185670439, "learning_rate": 0.00016323003990906072, "loss": 0.3142, "step": 1185 },
    { "epoch": 0.35543608124253284, "grad_norm": 0.33781054620192646, "learning_rate": 0.00016282528849136612, "loss": 0.2403, "step": 1190 },
    { "epoch": 0.3569295101553166, "grad_norm": 0.3880915127154374, "learning_rate": 0.0001624188295175019, "loss": 0.2999, "step": 1195 },
    { "epoch": 0.35842293906810035, "grad_norm": 0.359774759483394, "learning_rate": 0.00016201067403479543, "loss": 0.3101, "step": 1200 },
    { "epoch": 0.35842293906810035, "eval_loss": 0.2727881968021393, "eval_runtime": 672.0857, "eval_samples_per_second": 5.952, "eval_steps_per_second": 0.744, "step": 1200 },
    { "epoch": 0.3599163679808841, "grad_norm": 0.388508285355121, "learning_rate": 0.0001616008331366843, "loss": 0.2564, "step": 1205 },
    { "epoch": 0.36140979689366787, "grad_norm": 0.36408140890196194, "learning_rate": 0.00016118931796241457, "loss": 0.3252, "step": 1210 },
    { "epoch": 0.3629032258064516, "grad_norm": 0.41167492762052504, "learning_rate": 0.0001607761396967384, "loss": 0.26, "step": 1215 },
    { "epoch": 0.3643966547192354, "grad_norm": 0.5248867549730605, "learning_rate": 0.00016036130956960967, "loss": 0.2655, "step": 1220 },
    { "epoch": 0.36589008363201914, "grad_norm": 0.49678835879333444, "learning_rate": 0.00015994483885587902, "loss": 0.3085, "step": 1225 },
    { "epoch": 0.3673835125448029, "grad_norm": 0.3988902390012257, "learning_rate": 0.0001595267388749873, "loss": 0.2607, "step": 1230 },
    { "epoch": 0.3688769414575866, "grad_norm": 0.3925797006591169, "learning_rate": 0.0001591070209906579, "loss": 0.2997, "step": 1235 },
    { "epoch": 0.37037037037037035, "grad_norm": 0.3107954625386247, "learning_rate": 0.000158685696610588, "loss": 0.2537, "step": 1240 },
    { "epoch": 0.3718637992831541, "grad_norm": 0.41712837863937413, "learning_rate": 0.00015826277718613833, "loss": 0.3003, "step": 1245 },
    { "epoch": 0.37335722819593786, "grad_norm": 0.3630150573236221, "learning_rate": 0.00015783827421202214, "loss": 0.3056, "step": 1250 },
    { "epoch": 0.3748506571087216, "grad_norm": 0.38258678014490427, "learning_rate": 0.00015741219922599253, "loss": 0.2641, "step": 1255 },
    { "epoch": 0.3763440860215054, "grad_norm": 0.42701857998693166, "learning_rate": 0.00015698456380852915, "loss": 0.3068, "step": 1260 },
    { "epoch": 0.37783751493428913, "grad_norm": 0.4380649167022928, "learning_rate": 0.00015655537958252324, "loss": 0.2576, "step": 1265 },
    { "epoch": 0.3793309438470729, "grad_norm": 0.4082068880960036, "learning_rate": 0.00015612465821296175, "loss": 0.2953, "step": 1270 },
    { "epoch": 0.38082437275985664, "grad_norm": 0.4190875138856109, "learning_rate": 0.00015569241140661047, "loss": 0.3032, "step": 1275 },
    { "epoch": 0.3823178016726404, "grad_norm": 0.36568309245691916, "learning_rate": 0.0001552586509116955, "loss": 0.2195, "step": 1280 },
    { "epoch": 0.38381123058542416, "grad_norm": 0.44216085615223133, "learning_rate": 0.0001548233885175843, "loss": 0.3073, "step": 1285 },
    { "epoch": 0.38530465949820786, "grad_norm": 0.24922873576347065, "learning_rate": 0.00015438663605446507, "loss": 0.2472, "step": 1290 },
    { "epoch": 0.3867980884109916, "grad_norm": 0.41662099994810825, "learning_rate": 0.00015394840539302527, "loss": 0.2868, "step": 1295 },
    { "epoch": 0.38829151732377537, "grad_norm": 0.39646098577140865, "learning_rate": 0.0001535087084441289, "loss": 0.2855, "step": 1300 },
    { "epoch": 0.3897849462365591, "grad_norm": 0.36239571418608835, "learning_rate": 0.00015306755715849293, "loss": 0.227, "step": 1305 },
    { "epoch": 0.3912783751493429, "grad_norm": 0.35793958618986693, "learning_rate": 0.0001526249635263623, "loss": 0.2888, "step": 1310 },
    { "epoch": 0.39277180406212664, "grad_norm": 0.3783233493544514, "learning_rate": 0.00015218093957718415, "loss": 0.2605, "step": 1315 },
    { "epoch": 0.3942652329749104, "grad_norm": 0.37987257135077446, "learning_rate": 0.00015173549737928084, "loss": 0.2854, "step": 1320 },
    { "epoch": 0.39575866188769415, "grad_norm": 0.3892605729869642, "learning_rate": 0.0001512886490395219, "loss": 0.2918, "step": 1325 },
    { "epoch": 0.3972520908004779, "grad_norm": 0.41196692553982217, "learning_rate": 0.00015084040670299516, "loss": 0.2557, "step": 1330 },
    { "epoch": 0.39874551971326166, "grad_norm": 0.3254462520604089, "learning_rate": 0.00015039078255267628, "loss": 0.3088, "step": 1335 },
    { "epoch": 0.4002389486260454, "grad_norm": 0.40781344675681797, "learning_rate": 0.00014993978880909796, "loss": 0.2805, "step": 1340 },
    { "epoch": 0.4017323775388292, "grad_norm": 0.39304465117259413, "learning_rate": 0.00014948743773001772, "loss": 0.2843, "step": 1345 },
    { "epoch": 0.4032258064516129, "grad_norm": 0.48556741767644995, "learning_rate": 0.00014903374161008464, "loss": 0.3051, "step": 1350 },
    { "epoch": 0.40471923536439663, "grad_norm": 0.3663553575319031, "learning_rate": 0.0001485787127805052, "loss": 0.2336, "step": 1355 },
    { "epoch": 0.4062126642771804, "grad_norm": 0.4541411950152405, "learning_rate": 0.00014812236360870834, "loss": 0.2854, "step": 1360 },
    { "epoch": 0.40770609318996415, "grad_norm": 0.34795001206833953, "learning_rate": 0.00014766470649800904, "loss": 0.26, "step": 1365 },
    { "epoch": 0.4091995221027479, "grad_norm": 0.35146295322347754, "learning_rate": 0.00014720575388727132, "loss": 0.266, "step": 1370 },
    { "epoch": 0.41069295101553166, "grad_norm": 0.44633677983363984, "learning_rate": 0.00014674551825057024, "loss": 0.2853, "step": 1375 },
    { "epoch": 0.4121863799283154, "grad_norm": 0.3830303196674387, "learning_rate": 0.0001462840120968527, "loss": 0.2515, "step": 1380 },
    { "epoch": 0.4136798088410992, "grad_norm": 0.39604955470206826, "learning_rate": 0.00014582124796959765, "loss": 0.2751, "step": 1385 },
    { "epoch": 0.41517323775388293, "grad_norm": 0.3633797549153069, "learning_rate": 0.00014535723844647503, "loss": 0.2531, "step": 1390 },
    { "epoch": 0.4166666666666667, "grad_norm": 0.6020608532163392, "learning_rate": 0.00014489199613900385, "loss": 0.2766, "step": 1395 },
    { "epoch": 0.41816009557945044, "grad_norm": 0.38363605454823274, "learning_rate": 0.0001444255336922095, "loss": 0.2913, "step": 1400 },
    { "epoch": 0.41816009557945044, "eval_loss": 0.2678522765636444, "eval_runtime": 671.8286, "eval_samples_per_second": 5.954, "eval_steps_per_second": 0.744, "step": 1400 },
    { "epoch": 0.41965352449223414, "grad_norm": 0.34726800590023943, "learning_rate": 0.00014395786378428033, "loss": 0.2308, "step": 1405 },
    { "epoch": 0.4211469534050179, "grad_norm": 0.3445589635670954, "learning_rate": 0.00014348899912622248, "loss": 0.306, "step": 1410 },
    { "epoch": 0.42264038231780165, "grad_norm": 0.3573121569173274, "learning_rate": 0.0001430189524615149, "loss": 0.2669, "step": 1415 },
    { "epoch": 0.4241338112305854, "grad_norm": 0.40102571413373606, "learning_rate": 0.0001425477365657628, "loss": 0.2701, "step": 1420 },
    { "epoch": 0.42562724014336917, "grad_norm": 0.3591108084356434, "learning_rate": 0.0001420753642463504, "loss": 0.2866, "step": 1425 },
    { "epoch": 0.4271206690561529, "grad_norm": 0.38049883203567986, "learning_rate": 0.00014160184834209296, "loss": 0.267, "step": 1430 },
    { "epoch": 0.4286140979689367, "grad_norm": 0.34901096097918405, "learning_rate": 0.0001411272017228876, "loss": 0.2772, "step": 1435 },
    { "epoch": 0.43010752688172044, "grad_norm": 0.43110083401740446, "learning_rate": 0.0001406514372893637, "loss": 0.2773, "step": 1440 },
    { "epoch": 0.4316009557945042, "grad_norm": 0.49928896541185935, "learning_rate": 0.0001401745679725323, "loss": 0.3121, "step": 1445 },
    { "epoch": 0.43309438470728795, "grad_norm": 0.38753869075919367, "learning_rate": 0.00013969660673343435, "loss": 0.2809, "step": 1450 },
    { "epoch": 0.4345878136200717, "grad_norm": 0.431047229984796, "learning_rate": 0.00013921756656278884, "loss": 0.246, "step": 1455 },
    { "epoch": 0.43608124253285546, "grad_norm": 0.33787432704790127, "learning_rate": 0.00013873746048063943, "loss": 0.2905, "step": 1460 },
    { "epoch": 0.43757467144563916, "grad_norm": 0.5450443166622904, "learning_rate": 0.00013825630153600058, "loss": 0.2439, "step": 1465 },
    { "epoch": 0.4390681003584229, "grad_norm": 0.46991031442183334, "learning_rate": 0.0001377741028065031, "loss": 0.2916, "step": 1470 },
    { "epoch": 0.4405615292712067, "grad_norm": 0.43373701568669226, "learning_rate": 0.00013729087739803846, "loss": 0.3018, "step": 1475 },
    { "epoch": 0.44205495818399043, "grad_norm": 0.41610864937340164, "learning_rate": 0.0001368066384444028, "loss": 0.2252, "step": 1480 },
    { "epoch": 0.4435483870967742, "grad_norm": 0.3708263427774675, "learning_rate": 0.0001363213991069397, "loss": 0.2644, "step": 1485 },
    { "epoch": 0.44504181600955794, "grad_norm": 0.37850068431288725, "learning_rate": 0.00013583517257418278, "loss": 0.2641, "step": 1490 },
    { "epoch": 0.4465352449223417, "grad_norm": 0.4878849200428849, "learning_rate": 0.000135347972061497, "loss": 0.2722, "step": 1495 },
    { "epoch": 0.44802867383512546, "grad_norm": 0.3890166450566535, "learning_rate": 0.00013485981081071949, "loss": 0.2876, "step": 1500 },
    { "epoch": 0.4495221027479092, "grad_norm": 0.4698494336715817, "learning_rate": 0.00013437070208979974, "loss": 0.2618, "step": 1505 },
    { "epoch": 0.45101553166069297, "grad_norm": 0.42945392994548376, "learning_rate": 0.000133880659192439, "loss": 0.2887, "step": 1510 },
    { "epoch": 0.4525089605734767, "grad_norm": 0.36608609898718975, "learning_rate": 0.00013338969543772892, "loss": 0.2309, "step": 1515 },
    { "epoch": 0.4540023894862604, "grad_norm": 0.31719911951791674, "learning_rate": 0.00013289782416978947, "loss": 0.2669, "step": 1520 },
    { "epoch": 0.4554958183990442, "grad_norm": 0.44642248141095303, "learning_rate": 0.0001324050587574063, "loss": 0.2638, "step": 1525 },
    { "epoch": 0.45698924731182794, "grad_norm": 0.4094419878172264, "learning_rate": 0.0001319114125936675, "loss": 0.2647, "step": 1530 },
    { "epoch": 0.4584826762246117, "grad_norm": 0.48369368762161424, "learning_rate": 0.00013141689909559943, "loss": 0.298, "step": 1535 },
    { "epoch": 0.45997610513739545, "grad_norm": 0.4137701154348147, "learning_rate": 0.00013092153170380212, "loss": 0.2706, "step": 1540 },
    { "epoch": 0.4614695340501792, "grad_norm": 0.4125118524580438, "learning_rate": 0.00013042532388208398, "loss": 0.2769, "step": 1545 },
    { "epoch": 0.46296296296296297, "grad_norm": 0.3835331195333388, "learning_rate": 0.0001299282891170958, "loss": 0.29, "step": 1550 },
    { "epoch": 0.4644563918757467, "grad_norm": 0.4050806911084048, "learning_rate": 0.00012943044091796418, "loss": 0.2733, "step": 1555 },
    { "epoch": 0.4659498207885305, "grad_norm": 0.3686929748618163, "learning_rate": 0.00012893179281592453, "loss": 0.299, "step": 1560 },
    { "epoch": 0.46744324970131423, "grad_norm": 0.3713013220008561, "learning_rate": 0.0001284323583639531, "loss": 0.2514, "step": 1565 },
    { "epoch": 0.468936678614098, "grad_norm": 0.37363712594365256, "learning_rate": 0.00012793215113639862, "loss": 0.278, "step": 1570 },
    { "epoch": 0.47043010752688175, "grad_norm": 0.3759963829551066, "learning_rate": 0.00012743118472861365, "loss": 0.2819, "step": 1575 },
    { "epoch": 0.47192353643966545, "grad_norm": 0.38520117361685985, "learning_rate": 0.00012692947275658455, "loss": 0.2345, "step": 1580 },
    { "epoch": 0.4734169653524492, "grad_norm": 0.3075498245821686, "learning_rate": 0.00012642702885656192, "loss": 0.263, "step": 1585 },
    { "epoch": 0.47491039426523296, "grad_norm": 0.3841403461793403, "learning_rate": 0.00012592386668468968, "loss": 0.2913, "step": 1590 },
    { "epoch": 0.4764038231780167, "grad_norm": 0.3367807617111592, "learning_rate": 0.00012541999991663388, "loss": 0.2801, "step": 1595 },
    { "epoch": 0.4778972520908005, "grad_norm": 0.378016075471954, "learning_rate": 0.00012491544224721136, "loss": 0.2765, "step": 1600 },
    { "epoch": 0.4778972520908005, "eval_loss": 0.2625010311603546, "eval_runtime": 671.1626, "eval_samples_per_second": 5.96, "eval_steps_per_second": 0.745, "step": 1600 },
    { "epoch": 0.47939068100358423, "grad_norm": 0.3691633610486676, "learning_rate": 0.00012441020739001698, "loss": 0.2793, "step": 1605 },
    { "epoch": 0.480884109916368, "grad_norm": 0.3264004788515136, "learning_rate": 0.00012390430907705134, "loss": 0.2683, "step": 1610 },
    { "epoch": 0.48237753882915174, "grad_norm": 0.35913080769203726, "learning_rate": 0.00012339776105834744, "loss": 0.229, "step": 1615 },
    { "epoch": 0.4838709677419355, "grad_norm": 0.45935643829710515, "learning_rate": 0.00012289057710159672, "loss": 0.2714, "step": 1620 },
    { "epoch": 0.48536439665471925, "grad_norm": 0.3879403177186618, "learning_rate": 0.00012238277099177526, "loss": 0.2829, "step": 1625 },
    { "epoch": 0.486857825567503, "grad_norm": 0.3514395181766118, "learning_rate": 0.00012187435653076889, "loss": 0.249, "step": 1630 },
    { "epoch": 0.4883512544802867, "grad_norm": 0.3914277195367492, "learning_rate": 0.0001213653475369979, "loss": 0.2862, "step": 1635 },
    { "epoch": 0.48984468339307047, "grad_norm": 0.468390381134599, "learning_rate": 0.00012085575784504191, "loss": 0.2307, "step": 1640 },
    { "epoch": 0.4913381123058542, "grad_norm": 0.42676427345496687, "learning_rate": 0.0001203456013052634, "loss": 0.2963, "step": 1645 },
    { "epoch": 0.492831541218638, "grad_norm": 0.4042777140715105, "learning_rate": 0.0001198348917834315, "loss": 0.3155, "step": 1650 },
    { "epoch": 0.49432497013142174, "grad_norm": 0.333904149664281, "learning_rate": 0.00011932364316034514, "loss": 0.2433, "step": 1655 },
    { "epoch": 0.4958183990442055, "grad_norm": 0.41506264824806743, "learning_rate": 0.0001188118693314557, "loss": 0.2806, "step": 1660 },
    { "epoch": 0.49731182795698925, "grad_norm": 0.4293221243565824, "learning_rate": 0.0001182995842064893, "loss": 0.2582, "step": 1665 },
    { "epoch": 0.498805256869773, "grad_norm": 0.39173839921627196, "learning_rate": 0.00011778680170906888, "loss": 0.2886, "step": 1670 },
    { "epoch": 0.5002986857825568, "grad_norm": 0.47362247526518175, "learning_rate": 0.0001172735357763356, "loss": 0.293, "step": 1675 },
    { "epoch": 0.5017921146953405, "grad_norm": 0.4176472433369269, "learning_rate": 0.00011675980035857019, "loss": 0.2091, "step": 1680 },
    { "epoch": 0.5032855436081243, "grad_norm": 0.39087371326818915, "learning_rate": 0.00011624560941881371, "loss": 0.3164, "step": 1685 },
    { "epoch": 0.504778972520908, "grad_norm": 0.3444686695813055, "learning_rate": 0.00011573097693248805, "loss": 0.2601, "step": 1690 },
    { "epoch": 0.5062724014336918, "grad_norm": 0.3629126474420232, "learning_rate": 0.00011521591688701605, "loss": 0.2677, "step": 1695 },
    { "epoch": 0.5077658303464755, "grad_norm": 0.3677730257235259, "learning_rate": 0.00011470044328144143, "loss": 0.2845, "step": 1700 },
    { "epoch": 0.5092592592592593, "grad_norm": 0.4484524597840905, "learning_rate": 0.0001141845701260482, "loss": 0.2535, "step": 1705 },
    { "epoch": 0.510752688172043, "grad_norm": 0.44520906295668633, "learning_rate": 0.0001136683114419799, "loss": 0.2923, "step": 1710 },
    { "epoch": 0.5122461170848268, "grad_norm": 0.3843678110888455, "learning_rate": 0.00011315168126085857, "loss": 0.2419, "step": 1715 },
    { "epoch": 0.5137395459976105, "grad_norm": 0.3424866175556353, "learning_rate": 0.00011263469362440331, "loss": 0.2722, "step": 1720 },
    { "epoch": 0.5152329749103942, "grad_norm": 0.3664503507774818, "learning_rate": 0.00011211736258404864, "loss": 0.2652, "step": 1725 },
    { "epoch": 0.516726403823178, "grad_norm": 0.3882647311598004, "learning_rate": 0.00011159970220056262, "loss": 0.2613, "step": 1730 },
    { "epoch": 0.5182198327359617, "grad_norm": 0.3934259702377591, "learning_rate": 0.00011108172654366467, "loss": 0.2744, "step": 1735 },
    { "epoch": 0.5197132616487455, "grad_norm": 0.38348245744708515, "learning_rate": 0.00011056344969164317, "loss": 0.2484, "step": 1740 },
    { "epoch": 0.5212066905615292, "grad_norm": 0.4552854124473697, "learning_rate": 0.0001100448857309728, "loss": 0.2559, "step": 1745 },
    { "epoch": 0.522700119474313, "grad_norm": 0.39439256103140385, "learning_rate": 0.00010952604875593171, "loss": 0.2724, "step": 1750 },
    { "epoch": 0.5241935483870968, "grad_norm": 0.42701555614159337, "learning_rate": 0.00010900695286821843, "loss": 0.2284, "step": 1755 },
    { "epoch": 0.5256869772998806, "grad_norm": 0.3685725297263498, "learning_rate": 0.00010848761217656856, "loss": 0.2897, "step": 1760 },
    { "epoch": 0.5271804062126643, "grad_norm": 0.40798420628174437, "learning_rate": 0.00010796804079637137, "loss": 0.2689, "step": 1765 },
    { "epoch": 0.5286738351254481, "grad_norm": 0.4141834988836477, "learning_rate": 0.0001074482528492861, "loss": 0.2833, "step": 1770 },
|
{ |
|
"epoch": 0.5301672640382318, |
|
"grad_norm": 0.391053467201034, |
|
"learning_rate": 0.00010692826246285815, |
|
"loss": 0.2783, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.5316606929510155, |
|
"grad_norm": 0.3660707237272847, |
|
"learning_rate": 0.00010640808377013509, |
|
"loss": 0.2504, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.5331541218637993, |
|
"grad_norm": 0.29960550920174905, |
|
"learning_rate": 0.00010588773090928268, |
|
"loss": 0.2773, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.534647550776583, |
|
"grad_norm": 0.49371001212196336, |
|
"learning_rate": 0.00010536721802320027, |
|
"loss": 0.2543, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.5361409796893668, |
|
"grad_norm": 0.381676599509551, |
|
"learning_rate": 0.00010484655925913669, |
|
"loss": 0.3047, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.5376344086021505, |
|
"grad_norm": 0.4165684073566037, |
|
"learning_rate": 0.00010432576876830576, |
|
"loss": 0.2697, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5376344086021505, |
|
"eval_loss": 0.2601146101951599, |
|
"eval_runtime": 671.0756, |
|
"eval_samples_per_second": 5.961, |
|
"eval_steps_per_second": 0.745, |
|
"step": 1800 |
|
}, |
|
{
"epoch": 0.5391278375149343,
"grad_norm": 0.34164713060339763,
"learning_rate": 0.00010380486070550135,
"loss": 0.2613,
"step": 1805
},
{
"epoch": 0.540621266427718,
"grad_norm": 0.3608068205239928,
"learning_rate": 0.00010328384922871307,
"loss": 0.3023,
"step": 1810
},
{
"epoch": 0.5421146953405018,
"grad_norm": 0.4134220541761705,
"learning_rate": 0.0001027627484987412,
"loss": 0.2759,
"step": 1815
},
{
"epoch": 0.5436081242532855,
"grad_norm": 0.4380879213819027,
"learning_rate": 0.00010224157267881176,
"loss": 0.2999,
"step": 1820
},
{
"epoch": 0.5451015531660693,
"grad_norm": 0.41334705402523975,
"learning_rate": 0.00010172033593419184,
"loss": 0.2905,
"step": 1825
},
{
"epoch": 0.546594982078853,
"grad_norm": 0.33573837522797895,
"learning_rate": 0.00010119905243180432,
"loss": 0.2197,
"step": 1830
},
{
"epoch": 0.5480884109916367,
"grad_norm": 0.34530007854548944,
"learning_rate": 0.00010067773633984294,
"loss": 0.2838,
"step": 1835
},
{
"epoch": 0.5495818399044206,
"grad_norm": 0.39282261233632176,
"learning_rate": 0.00010015640182738733,
"loss": 0.2449,
"step": 1840
},
{
"epoch": 0.5510752688172043,
"grad_norm": 0.369443654894832,
"learning_rate": 9.963506306401753e-05,
"loss": 0.2641,
"step": 1845
},
{
"epoch": 0.5525686977299881,
"grad_norm": 0.3736606416875578,
"learning_rate": 9.911373421942939e-05,
"loss": 0.2877,
"step": 1850
},
{
"epoch": 0.5540621266427718,
"grad_norm": 0.4082824771004434,
"learning_rate": 9.859242946304903e-05,
"loss": 0.2462,
"step": 1855
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.37234188557058073,
"learning_rate": 9.807116296364783e-05,
"loss": 0.2647,
"step": 1860
},
{
"epoch": 0.5570489844683393,
"grad_norm": 0.3373920413107433,
"learning_rate": 9.754994888895744e-05,
"loss": 0.2481,
"step": 1865
},
{
"epoch": 0.5585424133811231,
"grad_norm": 0.3691166711599759,
"learning_rate": 9.702880140528466e-05,
"loss": 0.2651,
"step": 1870
},
{
"epoch": 0.5600358422939068,
"grad_norm": 0.4237027962737052,
"learning_rate": 9.650773467712632e-05,
"loss": 0.2976,
"step": 1875
},
{
"epoch": 0.5615292712066906,
"grad_norm": 0.3628296244980769,
"learning_rate": 9.598676286678434e-05,
"loss": 0.233,
"step": 1880
},
{
"epoch": 0.5630227001194743,
"grad_norm": 0.32168405334156497,
"learning_rate": 9.546590013398091e-05,
"loss": 0.2875,
"step": 1885
},
{
"epoch": 0.5645161290322581,
"grad_norm": 0.4011755917649591,
"learning_rate": 9.494516063547353e-05,
"loss": 0.2556,
"step": 1890
},
{
"epoch": 0.5660095579450418,
"grad_norm": 0.4531648719387392,
"learning_rate": 9.442455852467019e-05,
"loss": 0.2674,
"step": 1895
},
{
"epoch": 0.5675029868578255,
"grad_norm": 0.4603202269611572,
"learning_rate": 9.390410795124484e-05,
"loss": 0.2902,
"step": 1900
},
{
"epoch": 0.5689964157706093,
"grad_norm": 0.3873591567262209,
"learning_rate": 9.338382306075274e-05,
"loss": 0.2272,
"step": 1905
},
{
"epoch": 0.570489844683393,
"grad_norm": 0.41809794251121246,
"learning_rate": 9.286371799424584e-05,
"loss": 0.309,
"step": 1910
},
{
"epoch": 0.5719832735961768,
"grad_norm": 0.41735588072251223,
"learning_rate": 9.234380688788879e-05,
"loss": 0.2422,
"step": 1915
},
{
"epoch": 0.5734767025089605,
"grad_norm": 0.35987074653242784,
"learning_rate": 9.182410387257434e-05,
"loss": 0.2868,
"step": 1920
},
{
"epoch": 0.5749701314217444,
"grad_norm": 0.3381000166870348,
"learning_rate": 9.130462307353945e-05,
"loss": 0.2794,
"step": 1925
},
{
"epoch": 0.5764635603345281,
"grad_norm": 0.3412056255088943,
"learning_rate": 9.078537860998155e-05,
"loss": 0.2391,
"step": 1930
},
{
"epoch": 0.5779569892473119,
"grad_norm": 0.40246570101157014,
"learning_rate": 9.026638459467435e-05,
"loss": 0.2757,
"step": 1935
},
{
"epoch": 0.5794504181600956,
"grad_norm": 0.29830016511773766,
"learning_rate": 8.974765513358466e-05,
"loss": 0.2403,
"step": 1940
},
{
"epoch": 0.5809438470728794,
"grad_norm": 0.3411808872796586,
"learning_rate": 8.92292043254889e-05,
"loss": 0.2708,
"step": 1945
},
{
"epoch": 0.5824372759856631,
"grad_norm": 0.4133927108399512,
"learning_rate": 8.871104626158972e-05,
"loss": 0.2873,
"step": 1950
},
{
"epoch": 0.5839307048984468,
"grad_norm": 0.39290137528322705,
"learning_rate": 8.81931950251332e-05,
"loss": 0.2103,
"step": 1955
},
{
"epoch": 0.5854241338112306,
"grad_norm": 0.45501182269960977,
"learning_rate": 8.767566469102613e-05,
"loss": 0.2781,
"step": 1960
},
{
"epoch": 0.5869175627240143,
"grad_norm": 0.4189475808727597,
"learning_rate": 8.715846932545317e-05,
"loss": 0.2378,
"step": 1965
},
{
"epoch": 0.5884109916367981,
"grad_norm": 0.4085112647747901,
"learning_rate": 8.664162298549483e-05,
"loss": 0.2806,
"step": 1970
},
{
"epoch": 0.5899044205495818,
"grad_norm": 0.3732596924042426,
"learning_rate": 8.612513971874534e-05,
"loss": 0.2932,
"step": 1975
},
{
"epoch": 0.5913978494623656,
"grad_norm": 0.35250262119931725,
"learning_rate": 8.560903356293069e-05,
"loss": 0.2418,
"step": 1980
},
{
"epoch": 0.5928912783751493,
"grad_norm": 0.41366159645877715,
"learning_rate": 8.509331854552724e-05,
"loss": 0.2743,
"step": 1985
},
{
"epoch": 0.5943847072879331,
"grad_norm": 0.3464575423602374,
"learning_rate": 8.457800868338051e-05,
"loss": 0.2222,
"step": 1990
},
{
"epoch": 0.5958781362007168,
"grad_norm": 0.360808643210085,
"learning_rate": 8.406311798232408e-05,
"loss": 0.2625,
"step": 1995
},
{
"epoch": 0.5973715651135006,
"grad_norm": 0.4208234863436116,
"learning_rate": 8.354866043679887e-05,
"loss": 0.2759,
"step": 2000
},
{
"epoch": 0.5973715651135006,
"eval_loss": 0.25568902492523193,
"eval_runtime": 670.9224,
"eval_samples_per_second": 5.962,
"eval_steps_per_second": 0.745,
"step": 2000
},
|
{
"epoch": 0.5988649940262843,
"grad_norm": 0.5128576614444417,
"learning_rate": 8.303465002947302e-05,
"loss": 0.2265,
"step": 2005
},
{
"epoch": 0.600358422939068,
"grad_norm": 0.4246792251104683,
"learning_rate": 8.252110073086165e-05,
"loss": 0.2591,
"step": 2010
},
{
"epoch": 0.6018518518518519,
"grad_norm": 0.36587149536697655,
"learning_rate": 8.200802649894713e-05,
"loss": 0.2598,
"step": 2015
},
{
"epoch": 0.6033452807646356,
"grad_norm": 0.3560153774291601,
"learning_rate": 8.149544127879988e-05,
"loss": 0.26,
"step": 2020
},
{
"epoch": 0.6048387096774194,
"grad_norm": 0.3820268167566159,
"learning_rate": 8.098335900219929e-05,
"loss": 0.2812,
"step": 2025
},
{
"epoch": 0.6063321385902031,
"grad_norm": 0.4670838320418128,
"learning_rate": 8.047179358725487e-05,
"loss": 0.2398,
"step": 2030
},
{
"epoch": 0.6078255675029869,
"grad_norm": 0.4359410281219078,
"learning_rate": 7.99607589380282e-05,
"loss": 0.2973,
"step": 2035
},
{
"epoch": 0.6093189964157706,
"grad_norm": 0.3406561530367968,
"learning_rate": 7.945026894415504e-05,
"loss": 0.251,
"step": 2040
},
{
"epoch": 0.6108124253285544,
"grad_norm": 0.34818876087607525,
"learning_rate": 7.894033748046756e-05,
"loss": 0.2691,
"step": 2045
},
{
"epoch": 0.6123058542413381,
"grad_norm": 0.39942001844135094,
"learning_rate": 7.843097840661748e-05,
"loss": 0.2879,
"step": 2050
},
{
"epoch": 0.6137992831541219,
"grad_norm": 0.3618592441449212,
"learning_rate": 7.792220556669929e-05,
"loss": 0.223,
"step": 2055
},
{
"epoch": 0.6152927120669056,
"grad_norm": 0.4022080561354502,
"learning_rate": 7.741403278887397e-05,
"loss": 0.2785,
"step": 2060
},
{
"epoch": 0.6167861409796893,
"grad_norm": 0.26523185523371673,
"learning_rate": 7.690647388499305e-05,
"loss": 0.2482,
"step": 2065
},
{
"epoch": 0.6182795698924731,
"grad_norm": 0.33411219118251206,
"learning_rate": 7.639954265022349e-05,
"loss": 0.2703,
"step": 2070
},
{
"epoch": 0.6197729988052568,
"grad_norm": 0.35999133636969094,
"learning_rate": 7.589325286267241e-05,
"loss": 0.262,
"step": 2075
},
{
"epoch": 0.6212664277180406,
"grad_norm": 0.3853527518287356,
"learning_rate": 7.538761828301276e-05,
"loss": 0.2261,
"step": 2080
},
{
"epoch": 0.6227598566308243,
"grad_norm": 0.43335074980788474,
"learning_rate": 7.48826526541094e-05,
"loss": 0.254,
"step": 2085
},
{
"epoch": 0.6242532855436081,
"grad_norm": 0.3542418988641108,
"learning_rate": 7.437836970064545e-05,
"loss": 0.2334,
"step": 2090
},
{
"epoch": 0.6257467144563919,
"grad_norm": 0.398192032214991,
"learning_rate": 7.387478312874916e-05,
"loss": 0.2742,
"step": 2095
},
{
"epoch": 0.6272401433691757,
"grad_norm": 0.33547770368905083,
"learning_rate": 7.337190662562174e-05,
"loss": 0.2511,
"step": 2100
},
{
"epoch": 0.6287335722819594,
"grad_norm": 0.4179386434535002,
"learning_rate": 7.2869753859165e-05,
"loss": 0.2667,
"step": 2105
},
{
"epoch": 0.6302270011947432,
"grad_norm": 0.36708628994003195,
"learning_rate": 7.236833847760994e-05,
"loss": 0.2805,
"step": 2110
},
{
"epoch": 0.6317204301075269,
"grad_norm": 0.3128895549559772,
"learning_rate": 7.186767410914601e-05,
"loss": 0.2477,
"step": 2115
},
{
"epoch": 0.6332138590203107,
"grad_norm": 0.3308033239038678,
"learning_rate": 7.136777436155041e-05,
"loss": 0.2559,
"step": 2120
},
{
"epoch": 0.6347072879330944,
"grad_norm": 0.36157519220837353,
"learning_rate": 7.086865282181841e-05,
"loss": 0.2747,
"step": 2125
},
{
"epoch": 0.6362007168458781,
"grad_norm": 0.38491703355424195,
"learning_rate": 7.037032305579409e-05,
"loss": 0.202,
"step": 2130
},
{
"epoch": 0.6376941457586619,
"grad_norm": 0.29165681376515235,
"learning_rate": 6.987279860780148e-05,
"loss": 0.286,
"step": 2135
},
{
"epoch": 0.6391875746714456,
"grad_norm": 0.3691280781000132,
"learning_rate": 6.93760930002765e-05,
"loss": 0.2628,
"step": 2140
},
{
"epoch": 0.6406810035842294,
"grad_norm": 0.33721048070187576,
"learning_rate": 6.888021973339958e-05,
"loss": 0.2629,
"step": 2145
},
{
"epoch": 0.6421744324970131,
"grad_norm": 0.40230756965561154,
"learning_rate": 6.838519228472845e-05,
"loss": 0.2919,
"step": 2150
},
{
"epoch": 0.6436678614097969,
"grad_norm": 0.3832722542456181,
"learning_rate": 6.789102410883201e-05,
"loss": 0.2258,
"step": 2155
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.3679304149313773,
"learning_rate": 6.739772863692471e-05,
"loss": 0.2684,
"step": 2160
},
{
"epoch": 0.6466547192353644,
"grad_norm": 0.40756568956863426,
"learning_rate": 6.690531927650126e-05,
"loss": 0.2583,
"step": 2165
},
{
"epoch": 0.6481481481481481,
"grad_norm": 0.3980654356736509,
"learning_rate": 6.641380941097237e-05,
"loss": 0.26,
"step": 2170
},
{
"epoch": 0.649641577060932,
"grad_norm": 0.4297937372635352,
"learning_rate": 6.592321239930112e-05,
"loss": 0.2736,
"step": 2175
},
{
"epoch": 0.6511350059737157,
"grad_norm": 0.358991865502727,
"learning_rate": 6.54335415756396e-05,
"loss": 0.2458,
"step": 2180
},
{
"epoch": 0.6526284348864994,
"grad_norm": 0.35194743625956787,
"learning_rate": 6.49448102489666e-05,
"loss": 0.2731,
"step": 2185
},
{
"epoch": 0.6541218637992832,
"grad_norm": 0.3344682852694039,
"learning_rate": 6.445703170272603e-05,
"loss": 0.2392,
"step": 2190
},
{
"epoch": 0.6556152927120669,
"grad_norm": 0.36819816176424924,
"learning_rate": 6.39702191944657e-05,
"loss": 0.2742,
"step": 2195
},
{
"epoch": 0.6571087216248507,
"grad_norm": 0.41481385734175824,
"learning_rate": 6.348438595547705e-05,
"loss": 0.264,
"step": 2200
},
{
"epoch": 0.6571087216248507,
"eval_loss": 0.25236326456069946,
"eval_runtime": 670.9223,
"eval_samples_per_second": 5.962,
"eval_steps_per_second": 0.745,
"step": 2200
},
|
{
"epoch": 0.6586021505376344,
"grad_norm": 0.37828046273666066,
"learning_rate": 6.299954519043552e-05,
"loss": 0.2045,
"step": 2205
},
{
"epoch": 0.6600955794504182,
"grad_norm": 0.3647111671687454,
"learning_rate": 6.25157100770417e-05,
"loss": 0.2757,
"step": 2210
},
{
"epoch": 0.6615890083632019,
"grad_norm": 0.333924367566417,
"learning_rate": 6.203289376566307e-05,
"loss": 0.2334,
"step": 2215
},
{
"epoch": 0.6630824372759857,
"grad_norm": 0.3731260608605994,
"learning_rate": 6.155110937897667e-05,
"loss": 0.2359,
"step": 2220
},
{
"epoch": 0.6645758661887694,
"grad_norm": 0.36399635762642046,
"learning_rate": 6.107037001161248e-05,
"loss": 0.2509,
"step": 2225
},
{
"epoch": 0.6660692951015532,
"grad_norm": 0.3990653767812247,
"learning_rate": 6.0590688729797295e-05,
"loss": 0.2521,
"step": 2230
},
{
"epoch": 0.6675627240143369,
"grad_norm": 0.3608936095400094,
"learning_rate": 6.011207857099983e-05,
"loss": 0.2543,
"step": 2235
},
{
"epoch": 0.6690561529271206,
"grad_norm": 0.31260618340047636,
"learning_rate": 5.963455254357631e-05,
"loss": 0.2318,
"step": 2240
},
{
"epoch": 0.6705495818399044,
"grad_norm": 0.4642758261006183,
"learning_rate": 5.9158123626416794e-05,
"loss": 0.2754,
"step": 2245
},
{
"epoch": 0.6720430107526881,
"grad_norm": 0.37892791122014974,
"learning_rate": 5.868280476859249e-05,
"loss": 0.2821,
"step": 2250
},
{
"epoch": 0.6735364396654719,
"grad_norm": 0.317919026225424,
"learning_rate": 5.820860888900392e-05,
"loss": 0.2021,
"step": 2255
},
{
"epoch": 0.6750298685782556,
"grad_norm": 0.3638464340699625,
"learning_rate": 5.7735548876029597e-05,
"loss": 0.2758,
"step": 2260
},
{
"epoch": 0.6765232974910395,
"grad_norm": 0.4089378874713291,
"learning_rate": 5.7263637587175836e-05,
"loss": 0.2319,
"step": 2265
},
{
"epoch": 0.6780167264038232,
"grad_norm": 0.36255601718231956,
"learning_rate": 5.679288784872727e-05,
"loss": 0.2736,
"step": 2270
},
{
"epoch": 0.679510155316607,
"grad_norm": 0.4450766858081928,
"learning_rate": 5.632331245539826e-05,
"loss": 0.2682,
"step": 2275
},
{
"epoch": 0.6810035842293907,
"grad_norm": 0.38190771534156615,
"learning_rate": 5.585492416998511e-05,
"loss": 0.2289,
"step": 2280
},
{
"epoch": 0.6824970131421745,
"grad_norm": 0.4405779247739025,
"learning_rate": 5.5387735723019207e-05,
"loss": 0.2928,
"step": 2285
},
{
"epoch": 0.6839904420549582,
"grad_norm": 0.3774472213099317,
"learning_rate": 5.492175981242097e-05,
"loss": 0.2514,
"step": 2290
},
{
"epoch": 0.6854838709677419,
"grad_norm": 0.3377801375518953,
"learning_rate": 5.445700910315481e-05,
"loss": 0.2655,
"step": 2295
},
{
"epoch": 0.6869772998805257,
"grad_norm": 0.39587982015232115,
"learning_rate": 5.399349622688479e-05,
"loss": 0.2946,
"step": 2300
},
{
"epoch": 0.6884707287933094,
"grad_norm": 0.43119763327491156,
"learning_rate": 5.353123378163143e-05,
"loss": 0.2407,
"step": 2305
},
{
"epoch": 0.6899641577060932,
"grad_norm": 0.36132290047057575,
"learning_rate": 5.307023433142919e-05,
"loss": 0.2807,
"step": 2310
},
{
"epoch": 0.6914575866188769,
"grad_norm": 0.38966391503202974,
"learning_rate": 5.2610510405985034e-05,
"loss": 0.2187,
"step": 2315
},
{
"epoch": 0.6929510155316607,
"grad_norm": 0.36997141842452064,
"learning_rate": 5.21520745003379e-05,
"loss": 0.2772,
"step": 2320
},
{
"epoch": 0.6944444444444444,
"grad_norm": 0.4830585130129818,
"learning_rate": 5.169493907451906e-05,
"loss": 0.2846,
"step": 2325
},
{
"epoch": 0.6959378733572282,
"grad_norm": 0.4663739708408958,
"learning_rate": 5.123911655321345e-05,
"loss": 0.2334,
"step": 2330
},
{
"epoch": 0.6974313022700119,
"grad_norm": 0.38317110709020197,
"learning_rate": 5.078461932542204e-05,
"loss": 0.2903,
"step": 2335
},
{
"epoch": 0.6989247311827957,
"grad_norm": 0.44204152745518926,
"learning_rate": 5.033145974412502e-05,
"loss": 0.2418,
"step": 2340
},
{
"epoch": 0.7004181600955794,
"grad_norm": 0.3562540524906317,
"learning_rate": 4.987965012594612e-05,
"loss": 0.2429,
"step": 2345
},
{
"epoch": 0.7019115890083633,
"grad_norm": 0.415882439063641,
"learning_rate": 4.942920275081784e-05,
"loss": 0.2716,
"step": 2350
},
{
"epoch": 0.703405017921147,
"grad_norm": 0.4349967578566546,
"learning_rate": 4.898012986164764e-05,
"loss": 0.227,
"step": 2355
},
{
"epoch": 0.7048984468339307,
"grad_norm": 0.38888281415830517,
"learning_rate": 4.853244366398524e-05,
"loss": 0.2727,
"step": 2360
},
{
"epoch": 0.7063918757467145,
"grad_norm": 0.48748379250803325,
"learning_rate": 4.808615632569089e-05,
"loss": 0.2237,
"step": 2365
},
{
"epoch": 0.7078853046594982,
"grad_norm": 0.33545458271479295,
"learning_rate": 4.764127997660457e-05,
"loss": 0.2512,
"step": 2370
},
{
"epoch": 0.709378733572282,
"grad_norm": 0.4038148334793521,
"learning_rate": 4.719782670821642e-05,
"loss": 0.2729,
"step": 2375
},
{
"epoch": 0.7108721624850657,
"grad_norm": 0.34079737026356866,
"learning_rate": 4.675580857333799e-05,
"loss": 0.2256,
"step": 2380
},
{
"epoch": 0.7123655913978495,
"grad_norm": 0.46179757394318344,
"learning_rate": 4.631523758577475e-05,
"loss": 0.2722,
"step": 2385
},
{
"epoch": 0.7138590203106332,
"grad_norm": 0.39463185516160715,
"learning_rate": 4.5876125719999474e-05,
"loss": 0.2632,
"step": 2390
},
{
"epoch": 0.715352449223417,
"grad_norm": 0.370808339928303,
"learning_rate": 4.543848491082687e-05,
"loss": 0.2381,
"step": 2395
},
{
"epoch": 0.7168458781362007,
"grad_norm": 0.3791124095428876,
"learning_rate": 4.500232705308911e-05,
"loss": 0.2705,
"step": 2400
},
{
"epoch": 0.7168458781362007,
"eval_loss": 0.24903884530067444,
"eval_runtime": 670.4879,
"eval_samples_per_second": 5.966,
"eval_steps_per_second": 0.746,
"step": 2400
},
|
{
"epoch": 0.7183393070489845,
"grad_norm": 0.3775218725937434,
"learning_rate": 4.45676640013126e-05,
"loss": 0.2268,
"step": 2405
},
{
"epoch": 0.7198327359617682,
"grad_norm": 0.3827463104057589,
"learning_rate": 4.413450756939574e-05,
"loss": 0.2536,
"step": 2410
},
{
"epoch": 0.7213261648745519,
"grad_norm": 0.3648532710176232,
"learning_rate": 4.370286953028787e-05,
"loss": 0.2504,
"step": 2415
},
{
"epoch": 0.7228195937873357,
"grad_norm": 0.3055219924416998,
"learning_rate": 4.32727616156692e-05,
"loss": 0.2324,
"step": 2420
},
{
"epoch": 0.7243130227001194,
"grad_norm": 0.4438604649063271,
"learning_rate": 4.2844195515632166e-05,
"loss": 0.2589,
"step": 2425
},
{
"epoch": 0.7258064516129032,
"grad_norm": 0.3795959005263556,
"learning_rate": 4.2417182878363336e-05,
"loss": 0.2476,
"step": 2430
},
{
"epoch": 0.727299880525687,
"grad_norm": 0.3427145661133711,
"learning_rate": 4.199173530982715e-05,
"loss": 0.2829,
"step": 2435
},
{
"epoch": 0.7287933094384708,
"grad_norm": 0.39445938386306895,
"learning_rate": 4.156786437345044e-05,
"loss": 0.2703,
"step": 2440
},
{
"epoch": 0.7302867383512545,
"grad_norm": 0.39923700769389914,
"learning_rate": 4.114558158980785e-05,
"loss": 0.285,
"step": 2445
},
{
"epoch": 0.7317801672640383,
"grad_norm": 0.4038674564871101,
"learning_rate": 4.072489843630903e-05,
"loss": 0.2814,
"step": 2450
},
{
"epoch": 0.733273596176822,
"grad_norm": 0.36737797848087894,
"learning_rate": 4.030582634688669e-05,
"loss": 0.2397,
"step": 2455
},
{
"epoch": 0.7347670250896058,
"grad_norm": 0.33021542644527924,
"learning_rate": 3.98883767116855e-05,
"loss": 0.2415,
"step": 2460
},
{
"epoch": 0.7362604540023895,
"grad_norm": 0.4047149773056364,
"learning_rate": 3.947256087675286e-05,
"loss": 0.2636,
"step": 2465
},
{
"epoch": 0.7377538829151732,
"grad_norm": 0.418965527019863,
"learning_rate": 3.905839014373047e-05,
"loss": 0.2818,
"step": 2470
},
{
"epoch": 0.739247311827957,
"grad_norm": 0.44764561078371773,
"learning_rate": 3.864587576954688e-05,
"loss": 0.2781,
"step": 2475
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.39101875860430557,
"learning_rate": 3.823502896611184e-05,
"loss": 0.2577,
"step": 2480
},
{
"epoch": 0.7422341696535245,
"grad_norm": 0.3922118709138838,
"learning_rate": 3.782586090001155e-05,
"loss": 0.2563,
"step": 2485
},
{
"epoch": 0.7437275985663082,
"grad_norm": 0.379882936966773,
"learning_rate": 3.741838269220496e-05,
"loss": 0.2517,
"step": 2490
},
{
"epoch": 0.745221027479092,
"grad_norm": 0.37880769907144246,
"learning_rate": 3.7012605417721534e-05,
"loss": 0.2908,
"step": 2495
},
{
"epoch": 0.7467144563918757,
"grad_norm": 0.4030754847132898,
"learning_rate": 3.660854010536055e-05,
"loss": 0.2634,
"step": 2500
},
{
"epoch": 0.7482078853046595,
"grad_norm": 0.32788742027641565,
"learning_rate": 3.620619773739097e-05,
"loss": 0.2183,
"step": 2505
},
{
"epoch": 0.7497013142174432,
"grad_norm": 0.3263923214186051,
"learning_rate": 3.580558924925305e-05,
"loss": 0.2751,
"step": 2510
},
{
"epoch": 0.751194743130227,
"grad_norm": 0.450636347100308,
"learning_rate": 3.540672552926122e-05,
"loss": 0.2625,
"step": 2515
},
{
"epoch": 0.7526881720430108,
"grad_norm": 0.33757741246501627,
"learning_rate": 3.500961741830821e-05,
"loss": 0.2575,
"step": 2520
},
{
"epoch": 0.7541816009557945,
"grad_norm": 0.4249647628134897,
"learning_rate": 3.4614275709570033e-05,
"loss": 0.2619,
"step": 2525
},
{
"epoch": 0.7556750298685783,
"grad_norm": 0.37310284633404023,
"learning_rate": 3.422071114821304e-05,
"loss": 0.2466,
"step": 2530
},
{
"epoch": 0.757168458781362,
"grad_norm": 0.3827899754640218,
"learning_rate": 3.382893443110175e-05,
"loss": 0.2808,
"step": 2535
},
{
"epoch": 0.7586618876941458,
"grad_norm": 0.34434815179683986,
"learning_rate": 3.343895620650793e-05,
"loss": 0.2369,
"step": 2540
},
{
"epoch": 0.7601553166069295,
"grad_norm": 0.38063545879124566,
"learning_rate": 3.305078707382138e-05,
"loss": 0.277,
"step": 2545
},
{
"epoch": 0.7616487455197133,
"grad_norm": 0.39371915129642,
"learning_rate": 3.26644375832619e-05,
"loss": 0.2721,
"step": 2550
},
{
"epoch": 0.763142174432497,
"grad_norm": 0.4068353272903545,
"learning_rate": 3.2279918235592346e-05,
"loss": 0.23,
"step": 2555
},
{
"epoch": 0.7646356033452808,
"grad_norm": 0.40612317566802253,
"learning_rate": 3.189723948183322e-05,
"loss": 0.2665,
"step": 2560
},
{
"epoch": 0.7661290322580645,
"grad_norm": 0.36414950817333475,
"learning_rate": 3.151641172297891e-05,
"loss": 0.2534,
"step": 2565
},
{
"epoch": 0.7676224611708483,
"grad_norm": 0.36001677147408806,
"learning_rate": 3.1137445309714705e-05,
"loss": 0.2537,
"step": 2570
},
{
"epoch": 0.769115890083632,
"grad_norm": 0.4064294097626817,
"learning_rate": 3.076035054213548e-05,
"loss": 0.2628,
"step": 2575
},
{
"epoch": 0.7706093189964157,
"grad_norm": 0.38634424971456655,
"learning_rate": 3.0385137669465967e-05,
"loss": 0.2475,
"step": 2580
},
{
"epoch": 0.7721027479091995,
"grad_norm": 0.3868359752050659,
"learning_rate": 3.001181688978203e-05,
"loss": 0.257,
"step": 2585
},
{
"epoch": 0.7735961768219832,
"grad_norm": 0.31847898193899454,
"learning_rate": 2.9640398349733334e-05,
"loss": 0.243,
"step": 2590
},
{
"epoch": 0.775089605734767,
"grad_norm": 0.4155060821736252,
"learning_rate": 2.9270892144267993e-05,
"loss": 0.2739,
"step": 2595
},
{
"epoch": 0.7765830346475507,
"grad_norm": 0.36416964732293394,
"learning_rate": 2.890330831635778e-05,
"loss": 0.2694,
"step": 2600
},
{
"epoch": 0.7765830346475507,
"eval_loss": 0.24662606418132782,
"eval_runtime": 671.3061,
"eval_samples_per_second": 5.959,
"eval_steps_per_second": 0.745,
"step": 2600
},
|
{
"epoch": 0.7780764635603346,
"grad_norm": 0.34102575217523634,
"learning_rate": 2.853765685672528e-05,
"loss": 0.2345,
"step": 2605
},
{
"epoch": 0.7795698924731183,
"grad_norm": 0.3812471975597926,
"learning_rate": 2.8173947703572546e-05,
"loss": 0.2619,
"step": 2610
},
{
"epoch": 0.7810633213859021,
"grad_norm": 0.4709017717954452,
"learning_rate": 2.7812190742310707e-05,
"loss": 0.2359,
"step": 2615
},
{
"epoch": 0.7825567502986858,
"grad_norm": 0.3815586898993409,
"learning_rate": 2.7452395805291463e-05,
"loss": 0.2485,
"step": 2620
},
{
"epoch": 0.7840501792114696,
"grad_norm": 0.44121044789192165,
"learning_rate": 2.7094572671539765e-05,
"loss": 0.2615,
"step": 2625
},
{
"epoch": 0.7855436081242533,
"grad_norm": 0.35382301353141093,
"learning_rate": 2.6738731066488075e-05,
"loss": 0.2154,
"step": 2630
},
{
"epoch": 0.7870370370370371,
"grad_norm": 0.33814033552150285,
"learning_rate": 2.638488066171201e-05,
"loss": 0.2569,
"step": 2635
},
{
"epoch": 0.7885304659498208,
"grad_norm": 0.37849934442478683,
"learning_rate": 2.603303107466747e-05,
"loss": 0.2616,
"step": 2640
},
{
"epoch": 0.7900238948626045,
"grad_norm": 0.3904880202778289,
"learning_rate": 2.5683191868429247e-05,
"loss": 0.2587,
"step": 2645
},
{
"epoch": 0.7915173237753883,
"grad_norm": 0.3646861718649452,
"learning_rate": 2.5335372551431125e-05,
"loss": 0.2766,
"step": 2650
},
{
"epoch": 0.793010752688172,
"grad_norm": 0.38220642098483965,
"learning_rate": 2.4989582577207417e-05,
"loss": 0.2506,
"step": 2655
},
{
"epoch": 0.7945041816009558,
"grad_norm": 0.34734076514892087,
"learning_rate": 2.4645831344136037e-05,
"loss": 0.2676,
"step": 2660
},
{
"epoch": 0.7959976105137395,
"grad_norm": 0.33364705048661314,
"learning_rate": 2.4304128195183086e-05,
"loss": 0.2144,
"step": 2665
},
{
"epoch": 0.7974910394265233,
"grad_norm": 0.41830275352951174,
"learning_rate": 2.396448241764877e-05,
"loss": 0.281,
"step": 2670
},
{
"epoch": 0.798984468339307,
"grad_norm": 0.4095216224292848,
"learning_rate": 2.3626903242915267e-05,
"loss": 0.2855,
"step": 2675
},
{
"epoch": 0.8004778972520908,
"grad_norm": 0.3532672344479507,
"learning_rate": 2.329139984619553e-05,
"loss": 0.2391,
"step": 2680
},
{
"epoch": 0.8019713261648745,
"grad_norm": 0.3758073226768294,
"learning_rate": 2.295798134628404e-05,
"loss": 0.2929,
"step": 2685
},
{
"epoch": 0.8034647550776584,
"grad_norm": 0.35018181270060134,
"learning_rate": 2.2626656805309e-05,
"loss": 0.2357,
"step": 2690
},
{
"epoch": 0.804958183990442,
"grad_norm": 0.4220541464364436,
"learning_rate": 2.2297435228485918e-05,
"loss": 0.246,
"step": 2695
},
{
"epoch": 0.8064516129032258,
"grad_norm": 0.3870739284799057,
"learning_rate": 2.197032556387295e-05,
"loss": 0.2766,
"step": 2700
},
{
"epoch": 0.8079450418160096,
"grad_norm": 0.41053303775154976,
"learning_rate": 2.164533670212766e-05,
"loss": 0.2407,
"step": 2705
},
{
"epoch": 0.8094384707287933,
"grad_norm": 0.3731772239188146,
"learning_rate": 2.1322477476265367e-05,
"loss": 0.2864,
"step": 2710
},
{
"epoch": 0.8109318996415771,
"grad_norm": 0.37634222348253304,
"learning_rate": 2.1001756661419093e-05,
"loss": 0.2424,
"step": 2715
},
{
"epoch": 0.8124253285543608,
"grad_norm": 0.3644559156551827,
"learning_rate": 2.068318297460102e-05,
"loss": 0.2839,
"step": 2720
},
{
"epoch": 0.8139187574671446,
"grad_norm": 0.39313980741067645,
"learning_rate": 2.036676507446562e-05,
"loss": 0.2642,
"step": 2725
},
{
"epoch": 0.8154121863799283,
"grad_norm": 0.3850789535231269,
"learning_rate": 2.005251156107426e-05,
"loss": 0.2104,
"step": 2730
},
{
"epoch": 0.8169056152927121,
"grad_norm": 0.39312531551082797,
"learning_rate": 1.9740430975661528e-05,
"loss": 0.264,
"step": 2735
},
{
"epoch": 0.8183990442054958,
"grad_norm": 0.36295621136819534,
"learning_rate": 1.943053180040302e-05,
"loss": 0.2307,
"step": 2740
},
{
"epoch": 0.8198924731182796,
"grad_norm": 0.3725954567916029,
"learning_rate": 1.9122822458184807e-05,
"loss": 0.2479,
"step": 2745
},
{
"epoch": 0.8213859020310633,
"grad_norm": 0.3713829913873105,
"learning_rate": 1.8817311312374564e-05,
"loss": 0.2575,
"step": 2750
},
{
"epoch": 0.822879330943847,
"grad_norm": 0.3202055385441327,
"learning_rate": 1.8514006666594197e-05,
"loss": 0.2308,
"step": 2755
},
{
"epoch": 0.8243727598566308,
"grad_norm": 0.3833958130308794,
"learning_rate": 1.821291676449417e-05,
"loss": 0.2713,
"step": 2760
},
{
"epoch": 0.8258661887694145,
"grad_norm": 0.37070939699033145,
"learning_rate": 1.7914049789529465e-05,
"loss": 0.2253,
"step": 2765
},
{
"epoch": 0.8273596176821983,
"grad_norm": 0.3091049252789486,
"learning_rate": 1.7617413864737153e-05,
"loss": 0.2491,
"step": 2770
},
{
"epoch": 0.828853046594982,
"grad_norm": 0.3863395305062906,
"learning_rate": 1.73230170525156e-05,
"loss": 0.3097,
"step": 2775
},
{
"epoch": 0.8303464755077659,
"grad_norm": 0.3398351066167249,
"learning_rate": 1.7030867354405354e-05,
"loss": 0.2175,
"step": 2780
},
{
"epoch": 0.8318399044205496,
"grad_norm": 0.353170388342167,
"learning_rate": 1.674097271087165e-05,
"loss": 0.2526,
"step": 2785
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.37416682070994395,
"learning_rate": 1.6453341001088572e-05,
"loss": 0.2155,
"step": 2790
},
{
"epoch": 0.8348267622461171,
"grad_norm": 0.38759381091463324,
"learning_rate": 1.616798004272506e-05,
"loss": 0.2516,
"step": 2795
},
{
"epoch": 0.8363201911589009,
"grad_norm": 0.3774042044993102,
"learning_rate": 1.5884897591732127e-05,
"loss": 0.2639,
"step": 2800
},
{
"epoch": 0.8363201911589009,
"eval_loss": 0.24498099088668823,
"eval_runtime": 670.7113,
"eval_samples_per_second": 5.964,
"eval_steps_per_second": 0.745,
"step": 2800
},
|
{
"epoch": 0.8378136200716846,
"grad_norm": 0.47282083906303013,
"learning_rate": 1.5604101342132306e-05,
"loss": 0.1951,
"step": 2805
},
{
"epoch": 0.8393070489844683,
"grad_norm": 0.3464605159651398,
"learning_rate": 1.5325598925810548e-05,
"loss": 0.265,
"step": 2810
},
{
"epoch": 0.8408004778972521,
"grad_norm": 0.4146312634378869,
"learning_rate": 1.5049397912306518e-05,
"loss": 0.2574,
"step": 2815
},
{
"epoch": 0.8422939068100358,
"grad_norm": 0.40351451569319263,
"learning_rate": 1.4775505808609191e-05,
"loss": 0.2733,
"step": 2820
},
{
"epoch": 0.8437873357228196,
"grad_norm": 0.3716555969690491,
"learning_rate": 1.4503930058952586e-05,
"loss": 0.277,
"step": 2825
},
{
"epoch": 0.8452807646356033,
"grad_norm": 0.34726179203272095,
"learning_rate": 1.423467804461357e-05,
"loss": 0.2247,
"step": 2830
},
{
"epoch": 0.8467741935483871,
"grad_norm": 0.3357554714579106,
"learning_rate": 1.396775708371113e-05,
"loss": 0.2656,
"step": 2835
},
{
"epoch": 0.8482676224611708,
"grad_norm": 0.3575152949170555,
"learning_rate": 1.3703174431007559e-05,
"loss": 0.2352,
"step": 2840
},
{
"epoch": 0.8497610513739546,
"grad_norm": 0.3501080536067048,
"learning_rate": 1.344093727771124e-05,
"loss": 0.2406,
"step": 2845
},
{
"epoch": 0.8512544802867383,
"grad_norm": 0.5043952724631786,
"learning_rate": 1.3181052751281197e-05,
"loss": 0.2768,
"step": 2850
},
{
"epoch": 0.8527479091995221,
"grad_norm": 0.3575895422258797,
"learning_rate": 1.2923527915233336e-05,
"loss": 0.2125,
"step": 2855
},
{
"epoch": 0.8542413381123058,
"grad_norm": 0.3407245335177286,
"learning_rate": 1.2668369768948608e-05,
"loss": 0.2661,
"step": 2860
},
{
"epoch": 0.8557347670250897,
"grad_norm": 0.4458153062323654,
"learning_rate": 1.2415585247482498e-05,
"loss": 0.2226,
"step": 2865
},
{
"epoch": 0.8572281959378734,
"grad_norm": 0.4369000451331262,
"learning_rate": 1.2165181221376787e-05,
"loss": 0.272,
"step": 2870
},
{
"epoch": 0.8587216248506571,
"grad_norm": 0.48118794169539114,
"learning_rate": 1.1917164496472799e-05,
"loss": 0.2616,
"step": 2875
},
{
"epoch": 0.8602150537634409,
"grad_norm": 0.3744622943326254,
"learning_rate": 1.1671541813726194e-05,
"loss": 0.264,
"step": 2880
},
{
"epoch": 0.8617084826762246,
"grad_norm": 0.4536685590862434,
"learning_rate": 1.1428319849023984e-05,
"loss": 0.2557,
"step": 2885
},
{
"epoch": 0.8632019115890084,
"grad_norm": 0.34146777737976525,
"learning_rate": 1.1187505213003068e-05,
"loss": 0.2228,
"step": 2890
},
{
"epoch": 0.8646953405017921,
"grad_norm": 0.4000563885902401,
"learning_rate": 1.0949104450870384e-05,
"loss": 0.2672,
"step": 2895
},
{
"epoch": 0.8661887694145759,
"grad_norm": 0.380767424516121,
"learning_rate": 1.0713124042225165e-05,
"loss": 0.2704,
"step": 2900
},
{
"epoch": 0.8676821983273596,
"grad_norm": 0.3742084189949366,
"learning_rate": 1.0479570400882888e-05,
"loss": 0.2166,
"step": 2905
},
{
"epoch": 0.8691756272401434,
"grad_norm": 0.4568051868923601,
"learning_rate": 1.0248449874700705e-05,
"loss": 0.2639,
"step": 2910
},
{
"epoch": 0.8706690561529271,
"grad_norm": 0.3959326669620433,
"learning_rate": 1.0019768745405122e-05,
"loss": 0.224,
"step": 2915
},
{
"epoch": 0.8721624850657109,
"grad_norm": 0.3843360418655597,
"learning_rate": 9.79353322842127e-06,
"loss": 0.2596,
"step": 2920
},
{
"epoch": 0.8736559139784946,
"grad_norm": 0.3404605327204112,
"learning_rate": 9.569749472703816e-06,
"loss": 0.2532,
"step": 2925
},
{
"epoch": 0.8751493428912783,
"grad_norm": 0.4023542619848836,
"learning_rate": 9.34842356056993e-06,
"loss": 0.2461,
"step": 2930
},
{
"epoch": 0.8766427718040621,
"grad_norm": 0.34874999168812615,
"learning_rate": 9.129561507534046e-06,
"loss": 0.2625,
"step": 2935
},
{
"epoch": 0.8781362007168458,
"grad_norm": 0.363113367835939,
"learning_rate": 8.913169262144239e-06,
"loss": 0.2298,
"step": 2940
},
{
"epoch": 0.8796296296296297,
"grad_norm": 0.4366858825613128,
"learning_rate": 8.699252705820526e-06,
"loss": 0.266,
"step": 2945
},
{
"epoch": 0.8811230585424134,
"grad_norm": 0.3631698452397834,
"learning_rate": 8.487817652695229e-06,
"loss": 0.2884,
"step": 2950
},
{
"epoch": 0.8826164874551972,
"grad_norm": 0.3576538561239022,
"learning_rate": 8.278869849454718e-06,
"loss": 0.242,
"step": 2955
},
{
"epoch": 0.8841099163679809,
"grad_norm": 0.4230742530870855,
"learning_rate": 8.07241497518324e-06,
"loss": 0.2733,
"step": 2960
},
{
"epoch": 0.8856033452807647,
"grad_norm": 0.4193451166331605,
"learning_rate": 7.868458641208765e-06,
"loss": 0.2314,
"step": 2965
},
{
"epoch": 0.8870967741935484,
"grad_norm": 0.3701152603399041,
"learning_rate": 7.667006390950248e-06,
"loss": 0.2767,
"step": 2970
},
{
"epoch": 0.8885902031063322,
"grad_norm": 0.47090020438598185,
"learning_rate": 7.468063699767081e-06,
"loss": 0.2731,
"step": 2975
},
{
"epoch": 0.8900836320191159,
"grad_norm": 0.4420262562340979,
"learning_rate": 7.27163597481022e-06,
"loss": 0.2428,
"step": 2980
},
{
"epoch": 0.8915770609318996,
"grad_norm": 0.4555285504311469,
"learning_rate": 7.077728554875263e-06,
"loss": 0.2637,
"step": 2985
},
{
"epoch": 0.8930704898446834,
"grad_norm": 0.32544659259467257,
"learning_rate": 6.886346710257363e-06,
"loss": 0.2332,
"step": 2990
},
{
"epoch": 0.8945639187574671,
"grad_norm": 0.39917120432545034,
"learning_rate": 6.697495642607854e-06,
"loss": 0.2774,
"step": 2995
},
{
"epoch": 0.8960573476702509,
"grad_norm": 0.4322326823522948,
"learning_rate": 6.511180484793067e-06,
"loss": 0.2598,
"step": 3000
},
{
"epoch": 0.8960573476702509,
"eval_loss": 0.24354791641235352,
"eval_runtime": 670.7371,
"eval_samples_per_second": 5.964,
"eval_steps_per_second": 0.745,
"step": 3000
},
|
{
"epoch": 0.8975507765830346,
"grad_norm": 0.41924813736657407,
"learning_rate": 6.3274063007547125e-06,
"loss": 0.249,
"step": 3005
},
{
"epoch": 0.8990442054958184,
"grad_norm": 0.31419280668927857,
"learning_rate": 6.146178085372156e-06,
"loss": 0.2753,
"step": 3010
},
{
"epoch": 0.9005376344086021,
"grad_norm": 0.36068884644950366,
"learning_rate": 5.9675007643269054e-06,
"loss": 0.2412,
"step": 3015
},
{
"epoch": 0.9020310633213859,
"grad_norm": 0.3391790680684511,
"learning_rate": 5.791379193968505e-06,
"loss": 0.2583,
"step": 3020
},
{
"epoch": 0.9035244922341696,
"grad_norm": 0.362307062919707,
"learning_rate": 5.617818161182586e-06,
"loss": 0.2867,
"step": 3025
},
{
"epoch": 0.9050179211469535,
"grad_norm": 0.38255580305837367,
"learning_rate": 5.446822383260908e-06,
"loss": 0.2392,
"step": 3030
},
{
"epoch": 0.9065113500597372,
"grad_norm": 0.41055570601524083,
"learning_rate": 5.2783965077729755e-06,
"loss": 0.2414,
"step": 3035
},
{
"epoch": 0.9080047789725209,
"grad_norm": 0.4885743038657098,
"learning_rate": 5.112545112439782e-06,
"loss": 0.2534,
"step": 3040
},
{
"epoch": 0.9094982078853047,
"grad_norm": 0.42278986461993495,
"learning_rate": 4.949272705009411e-06,
"loss": 0.2591,
"step": 3045
},
{
"epoch": 0.9109916367980884,
"grad_norm": 0.4025383892648368,
"learning_rate": 4.788583723134498e-06,
"loss": 0.2887,
"step": 3050
},
{
"epoch": 0.9124850657108722,
"grad_norm": 0.42940556785118494,
"learning_rate": 4.630482534251601e-06,
"loss": 0.2239,
"step": 3055
},
{
"epoch": 0.9139784946236559,
"grad_norm": 0.4252373447840496,
"learning_rate": 4.474973435462526e-06,
"loss": 0.2648,
"step": 3060
},
{
"epoch": 0.9154719235364397,
"grad_norm": 0.34717731405681657,
"learning_rate": 4.322060653417525e-06,
"loss": 0.2359,
"step": 3065
},
{
"epoch": 0.9169653524492234,
"grad_norm": 0.3932159130540263,
"learning_rate": 4.171748344200399e-06,
"loss": 0.2844,
"step": 3070
},
{
"epoch": 0.9184587813620072,
"grad_norm": 0.3818285602145142,
"learning_rate": 4.0240405932155725e-06,
"loss": 0.2601,
"step": 3075
},
{
"epoch": 0.9199522102747909,
"grad_norm": 0.3967329457656309,
"learning_rate": 3.87894141507702e-06,
"loss": 0.2485,
"step": 3080
},
{
"epoch": 0.9214456391875747,
"grad_norm": 0.3646492393972917,
"learning_rate": 3.7364547534991745e-06,
"loss": 0.2921,
"step": 3085
},
{
"epoch": 0.9229390681003584,
"grad_norm": 0.31319503804493787,
"learning_rate": 3.5965844811897243e-06,
"loss": 0.2294,
"step": 3090
},
{
"epoch": 0.9244324970131422,
"grad_norm": 0.36230176443854195,
"learning_rate": 3.459334399744374e-06,
"loss": 0.2616,
"step": 3095
},
{
"epoch": 0.9259259259259259,
"grad_norm": 0.46214491992499196,
"learning_rate": 3.3247082395434835e-06,
"loss": 0.2821,
"step": 3100
},
{
"epoch": 0.9274193548387096,
"grad_norm": 0.32464884452568865,
"learning_rate": 3.19270965965075e-06,
"loss": 0.2249,
"step": 3105
},
{
"epoch": 0.9289127837514934,
"grad_norm": 0.38961006991574,
"learning_rate": 3.063342247713652e-06,
"loss": 0.2814,
"step": 3110
},
{
"epoch": 0.9304062126642771,
"grad_norm": 0.3430373709286391,
"learning_rate": 2.9366095198660292e-06,
"loss": 0.2424,
"step": 3115
},
{
"epoch": 0.931899641577061,
"grad_norm": 0.3754679213082568,
"learning_rate": 2.812514920632481e-06,
"loss": 0.2458,
"step": 3120
},
{
"epoch": 0.9333930704898447,
"grad_norm": 0.32857648616715374,
"learning_rate": 2.6910618228347397e-06,
"loss": 0.2648,
"step": 3125
},
{
"epoch": 0.9348864994026285,
"grad_norm": 0.38704797315639966,
"learning_rate": 2.5722535275000014e-06,
"loss": 0.2576,
"step": 3130
},
{
"epoch": 0.9363799283154122,
"grad_norm": 0.347816629229733,
"learning_rate": 2.456093263771242e-06,
"loss": 0.2579,
"step": 3135
},
{
"epoch": 0.937873357228196,
"grad_norm": 0.3285229980093725,
"learning_rate": 2.3425841888193744e-06,
"loss": 0.226,
"step": 3140
},
{
"epoch": 0.9393667861409797,
"grad_norm": 0.35095116254795605,
"learning_rate": 2.2317293877575195e-06,
"loss": 0.2715,
"step": 3145
},
{
"epoch": 0.9408602150537635,
"grad_norm": 0.43133915586490196,
"learning_rate": 2.1235318735571164e-06,
"loss": 0.2743,
"step": 3150
},
{
"epoch": 0.9423536439665472,
"grad_norm": 0.39808144981035765,
"learning_rate": 2.0179945869660098e-06,
"loss": 0.2133,
"step": 3155
},
{
"epoch": 0.9438470728793309,
"grad_norm": 0.37858205437619763,
"learning_rate": 1.9151203964285936e-06,
"loss": 0.2744,
"step": 3160
},
{
"epoch": 0.9453405017921147,
"grad_norm": 0.33007798696691204,
"learning_rate": 1.8149120980077595e-06,
"loss": 0.2364,
"step": 3165
},
{
"epoch": 0.9468339307048984,
"grad_norm": 0.40641752996751496,
"learning_rate": 1.7173724153089598e-06,
"loss": 0.2546,
"step": 3170
},
{
"epoch": 0.9483273596176822,
"grad_norm": 0.41533410990816155,
"learning_rate": 1.6225039994061552e-06,
"loss": 0.2551,
"step": 3175
},
{
"epoch": 0.9498207885304659,
"grad_norm": 0.31219456691145986,
"learning_rate": 1.5303094287697938e-06,
"loss": 0.2152,
"step": 3180
},
{
"epoch": 0.9513142174432497,
"grad_norm": 0.394461459007535,
"learning_rate": 1.4407912091966902e-06,
"loss": 0.2989,
"step": 3185
},
{
"epoch": 0.9528076463560334,
"grad_norm": 0.3577386265843814,
"learning_rate": 1.353951773741935e-06,
"loss": 0.2229,
"step": 3190
},
{
"epoch": 0.9543010752688172,
"grad_norm": 0.3868747591156057,
"learning_rate": 1.2697934826527701e-06,
"loss": 0.249,
"step": 3195
},
{
"epoch": 0.955794504181601,
"grad_norm": 0.3780574063379163,
"learning_rate": 1.1883186233044186e-06,
"loss": 0.2483,
"step": 3200
},
{
"epoch": 0.955794504181601,
"eval_loss": 0.2432253062725067,
"eval_runtime": 671.4179,
"eval_samples_per_second": 5.958,
"eval_steps_per_second": 0.745,
"step": 3200
},
|
{
"epoch": 0.9572879330943848,
"grad_norm": 0.3892026722877345,
"learning_rate": 1.109529410137966e-06,
"loss": 0.2305,
"step": 3205
},
{
"epoch": 0.9587813620071685,
"grad_norm": 0.3931487469870653,
"learning_rate": 1.0334279846001106e-06,
"loss": 0.2544,
"step": 3210
},
{
"epoch": 0.9602747909199522,
"grad_norm": 0.36069640184936463,
"learning_rate": 9.600164150849854e-07,
"loss": 0.2575,
"step": 3215
},
{
"epoch": 0.961768219832736,
"grad_norm": 0.44849903301516003,
"learning_rate": 8.892966968779615e-07,
"loss": 0.2503,
"step": 3220
},
{
"epoch": 0.9632616487455197,
"grad_norm": 0.411995160828815,
"learning_rate": 8.212707521013774e-07,
"loss": 0.2612,
"step": 3225
},
{
"epoch": 0.9647550776583035,
"grad_norm": 0.5604597904188017,
"learning_rate": 7.559404296623495e-07,
"loss": 0.2106,
"step": 3230
},
{
"epoch": 0.9662485065710872,
"grad_norm": 0.3789859551503982,
"learning_rate": 6.933075052024562e-07,
"loss": 0.2597,
"step": 3235
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.3029358775615665,
"learning_rate": 6.333736810495095e-07,
"loss": 0.22,
"step": 3240
},
{
"epoch": 0.9692353643966547,
"grad_norm": 0.43863865163510735,
"learning_rate": 5.761405861713142e-07,
"loss": 0.2888,
"step": 3245
},
{
"epoch": 0.9707287933094385,
"grad_norm": 0.3410374697833037,
"learning_rate": 5.216097761313377e-07,
"loss": 0.2615,
"step": 3250
},
{
"epoch": 0.9722222222222222,
"grad_norm": 0.3972032404026882,
"learning_rate": 4.6978273304646434e-07,
"loss": 0.2084,
"step": 3255
},
{
"epoch": 0.973715651135006,
"grad_norm": 0.43978255054644894,
"learning_rate": 4.206608655467403e-07,
"loss": 0.2626,
"step": 3260
},
{
"epoch": 0.9752090800477897,
"grad_norm": 0.4054251372400272,
"learning_rate": 3.7424550873699227e-07,
"loss": 0.2104,
"step": 3265
},
{
"epoch": 0.9767025089605734,
"grad_norm": 0.3250043163561357,
"learning_rate": 3.305379241606343e-07,
"loss": 0.2575,
"step": 3270
},
{
"epoch": 0.9781959378733572,
"grad_norm": 0.34288193953849005,
"learning_rate": 2.8953929976536233e-07,
"loss": 0.2521,
"step": 3275
},
{
"epoch": 0.9796893667861409,
"grad_norm": 0.3438606265427078,
"learning_rate": 2.51250749870835e-07,
"loss": 0.2223,
"step": 3280
},
{
"epoch": 0.9811827956989247,
"grad_norm": 0.45991832962727547,
"learning_rate": 2.156733151383872e-07,
"loss": 0.2615,
"step": 3285
},
{
"epoch": 0.9826762246117084,
"grad_norm": 0.34236518853477144,
"learning_rate": 1.8280796254279698e-07,
"loss": 0.2332,
"step": 3290
},
{
"epoch": 0.9841696535244923,
"grad_norm": 0.3358771867133346,
"learning_rate": 1.5265558534592883e-07,
"loss": 0.2436,
"step": 3295
},
{
"epoch": 0.985663082437276,
"grad_norm": 0.393065156149057,
"learning_rate": 1.252170030725308e-07,
"loss": 0.2707,
"step": 3300
},
{
"epoch": 0.9871565113500598,
"grad_norm": 0.3901577896476038,
"learning_rate": 1.0049296148790799e-07,
"loss": 0.2241,
"step": 3305
},
{
"epoch": 0.9886499402628435,
"grad_norm": 0.39775264512368114,
"learning_rate": 7.848413257766085e-08,
"loss": 0.2768,
"step": 3310
},
{
"epoch": 0.9901433691756273,
"grad_norm": 0.4277804217503546,
"learning_rate": 5.9191114529433266e-08,
"loss": 0.2351,
"step": 3315
},
{
"epoch": 0.991636798088411,
"grad_norm": 0.326337320725509,
"learning_rate": 4.26144317166699e-08,
"loss": 0.2526,
"step": 3320
},
{
"epoch": 0.9931302270011948,
"grad_norm": 0.4455257895279916,
"learning_rate": 2.8754534684316547e-08,
"loss": 0.267,
"step": 3325
},
{
"epoch": 0.9946236559139785,
"grad_norm": 0.3562424876586232,
"learning_rate": 1.7611800136618783e-08,
"loss": 0.2358,
"step": 3330
},
{
"epoch": 0.9961170848267622,
"grad_norm": 0.36120057011336665,
"learning_rate": 9.186530926874604e-09,
"loss": 0.2675,
"step": 3335
},
{
"epoch": 0.997610513739546,
"grad_norm": 0.3421927721985385,
"learning_rate": 3.4789560491743643e-09,
"loss": 0.2207,
"step": 3340
},
{
"epoch": 0.9991039426523297,
"grad_norm": 0.3603187953155133,
"learning_rate": 4.892306322057394e-10,
"loss": 0.2668,
"step": 3345
},
{
"epoch": 1.0,
"step": 3348,
"total_flos": 1.6414748941746176e+16,
"train_loss": 0.27713136451503567,
"train_runtime": 31532.8493,
"train_samples_per_second": 1.699,
"train_steps_per_second": 0.106
}
],
"logging_steps": 5,
"max_steps": 3348,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6414748941746176e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}