{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.950738916256158,
  "eval_steps": 500,
  "global_step": 1010,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009852216748768473,
      "grad_norm": 1.765625,
      "learning_rate": 1.9801980198019803e-06,
      "loss": 2.8604,
      "step": 1
    },
    {
      "epoch": 0.04926108374384237,
      "grad_norm": 1.4921875,
      "learning_rate": 9.900990099009901e-06,
      "loss": 2.8185,
      "step": 5
    },
    {
      "epoch": 0.09852216748768473,
      "grad_norm": 2.1875,
      "learning_rate": 1.9801980198019803e-05,
      "loss": 2.8289,
      "step": 10
    },
    {
      "epoch": 0.1477832512315271,
      "grad_norm": 2.0625,
      "learning_rate": 2.9702970297029702e-05,
      "loss": 2.7485,
      "step": 15
    },
    {
      "epoch": 0.19704433497536947,
      "grad_norm": 3.3125,
      "learning_rate": 3.9603960396039605e-05,
      "loss": 2.6773,
      "step": 20
    },
    {
      "epoch": 0.24630541871921183,
      "grad_norm": 2.3125,
      "learning_rate": 4.950495049504951e-05,
      "loss": 2.5348,
      "step": 25
    },
    {
      "epoch": 0.2955665024630542,
      "grad_norm": 1.3359375,
      "learning_rate": 5.9405940594059404e-05,
      "loss": 2.375,
      "step": 30
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 1.5,
      "learning_rate": 6.93069306930693e-05,
      "loss": 2.225,
      "step": 35
    },
    {
      "epoch": 0.39408866995073893,
      "grad_norm": 1.1796875,
      "learning_rate": 7.920792079207921e-05,
      "loss": 2.0577,
      "step": 40
    },
    {
      "epoch": 0.4433497536945813,
      "grad_norm": 2.3125,
      "learning_rate": 8.910891089108912e-05,
      "loss": 1.9332,
      "step": 45
    },
    {
      "epoch": 0.49261083743842365,
      "grad_norm": 0.890625,
      "learning_rate": 9.900990099009902e-05,
      "loss": 1.8054,
      "step": 50
    },
    {
      "epoch": 0.541871921182266,
      "grad_norm": 0.82421875,
      "learning_rate": 0.00010891089108910893,
      "loss": 1.7066,
      "step": 55
    },
    {
      "epoch": 0.5911330049261084,
      "grad_norm": 0.55078125,
      "learning_rate": 0.00011881188118811881,
      "loss": 1.6055,
      "step": 60
    },
    {
      "epoch": 0.6403940886699507,
      "grad_norm": 0.44921875,
      "learning_rate": 0.00012871287128712872,
      "loss": 1.5102,
      "step": 65
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.8125,
      "learning_rate": 0.0001386138613861386,
      "loss": 1.4332,
      "step": 70
    },
    {
      "epoch": 0.7389162561576355,
      "grad_norm": 0.48046875,
      "learning_rate": 0.0001485148514851485,
      "loss": 1.3703,
      "step": 75
    },
    {
      "epoch": 0.7881773399014779,
      "grad_norm": 0.62109375,
      "learning_rate": 0.00015841584158415842,
      "loss": 1.3483,
      "step": 80
    },
    {
      "epoch": 0.8374384236453202,
      "grad_norm": 0.5625,
      "learning_rate": 0.00016831683168316833,
      "loss": 1.2989,
      "step": 85
    },
    {
      "epoch": 0.8866995073891626,
      "grad_norm": 0.55078125,
      "learning_rate": 0.00017821782178217824,
      "loss": 1.2627,
      "step": 90
    },
    {
      "epoch": 0.9359605911330049,
      "grad_norm": 0.71875,
      "learning_rate": 0.00018811881188118812,
      "loss": 1.2456,
      "step": 95
    },
    {
      "epoch": 0.9852216748768473,
      "grad_norm": 0.5,
      "learning_rate": 0.00019801980198019803,
      "loss": 1.2361,
      "step": 100
    },
    {
      "epoch": 0.9950738916256158,
      "eval_loss": 2.518617868423462,
      "eval_runtime": 0.6245,
      "eval_samples_per_second": 16.014,
      "eval_steps_per_second": 1.601,
      "step": 101
    },
    {
      "epoch": 1.0344827586206897,
      "grad_norm": 0.609375,
      "learning_rate": 0.000199990444464082,
      "loss": 1.2023,
      "step": 105
    },
    {
      "epoch": 1.083743842364532,
      "grad_norm": 0.5546875,
      "learning_rate": 0.00019995162822919883,
      "loss": 1.1906,
      "step": 110
    },
    {
      "epoch": 1.1330049261083743,
      "grad_norm": 0.4375,
      "learning_rate": 0.00019988296565626987,
      "loss": 1.1786,
      "step": 115
    },
    {
      "epoch": 1.1822660098522166,
      "grad_norm": 0.62109375,
      "learning_rate": 0.00019978447724847652,
      "loss": 1.1654,
      "step": 120
    },
    {
      "epoch": 1.2315270935960592,
      "grad_norm": 0.466796875,
      "learning_rate": 0.0001996561924152278,
      "loss": 1.1533,
      "step": 125
    },
    {
      "epoch": 1.2807881773399015,
      "grad_norm": 0.466796875,
      "learning_rate": 0.00019949814946337838,
      "loss": 1.149,
      "step": 130
    },
    {
      "epoch": 1.3300492610837438,
      "grad_norm": 0.5234375,
      "learning_rate": 0.00019931039558578997,
      "loss": 1.1363,
      "step": 135
    },
    {
      "epoch": 1.3793103448275863,
      "grad_norm": 0.427734375,
      "learning_rate": 0.00019909298684723904,
      "loss": 1.1292,
      "step": 140
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.7109375,
      "learning_rate": 0.00019884598816767563,
      "loss": 1.1319,
      "step": 145
    },
    {
      "epoch": 1.477832512315271,
      "grad_norm": 0.384765625,
      "learning_rate": 0.00019856947330283752,
      "loss": 1.1248,
      "step": 150
    },
    {
      "epoch": 1.5270935960591134,
      "grad_norm": 0.578125,
      "learning_rate": 0.00019826352482222638,
      "loss": 1.1241,
      "step": 155
    },
    {
      "epoch": 1.5763546798029555,
      "grad_norm": 0.47265625,
      "learning_rate": 0.00019792823408445174,
      "loss": 1.1179,
      "step": 160
    },
    {
      "epoch": 1.625615763546798,
      "grad_norm": 0.55859375,
      "learning_rate": 0.00019756370120995066,
      "loss": 1.1112,
      "step": 165
    },
    {
      "epoch": 1.6748768472906403,
      "grad_norm": 0.462890625,
      "learning_rate": 0.00019717003505109095,
      "loss": 1.1021,
      "step": 170
    },
    {
      "epoch": 1.7241379310344827,
      "grad_norm": 0.93359375,
      "learning_rate": 0.0001967473531596671,
      "loss": 1.1162,
      "step": 175
    },
    {
      "epoch": 1.7733990147783252,
      "grad_norm": 0.46875,
      "learning_rate": 0.0001962957817517982,
      "loss": 1.0989,
      "step": 180
    },
    {
      "epoch": 1.8226600985221675,
      "grad_norm": 0.482421875,
      "learning_rate": 0.000195815455670239,
      "loss": 1.1004,
      "step": 185
    },
    {
      "epoch": 1.8719211822660098,
      "grad_norm": 0.55859375,
      "learning_rate": 0.00019530651834411474,
      "loss": 1.0986,
      "step": 190
    },
    {
      "epoch": 1.9211822660098523,
      "grad_norm": 0.40625,
      "learning_rate": 0.0001947691217460921,
      "loss": 1.0921,
      "step": 195
    },
    {
      "epoch": 1.9704433497536946,
      "grad_norm": 0.59375,
      "learning_rate": 0.0001942034263469989,
      "loss": 1.0968,
      "step": 200
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.484497547149658,
      "eval_runtime": 0.5386,
      "eval_samples_per_second": 18.567,
      "eval_steps_per_second": 1.857,
      "step": 203
    },
    {
      "epoch": 2.019704433497537,
      "grad_norm": 0.75390625,
      "learning_rate": 0.00019360960106790643,
      "loss": 1.0792,
      "step": 205
    },
    {
      "epoch": 2.0689655172413794,
      "grad_norm": 0.455078125,
      "learning_rate": 0.00019298782322968815,
      "loss": 1.0645,
      "step": 210
    },
    {
      "epoch": 2.1182266009852215,
      "grad_norm": 0.6015625,
      "learning_rate": 0.00019233827850007027,
      "loss": 1.069,
      "step": 215
    },
    {
      "epoch": 2.167487684729064,
      "grad_norm": 0.48046875,
      "learning_rate": 0.00019166116083819002,
      "loss": 1.0589,
      "step": 220
    },
    {
      "epoch": 2.2167487684729066,
      "grad_norm": 0.53515625,
      "learning_rate": 0.0001909566724366779,
      "loss": 1.0646,
      "step": 225
    },
    {
      "epoch": 2.2660098522167487,
      "grad_norm": 0.5546875,
      "learning_rate": 0.00019022502366128135,
      "loss": 1.0631,
      "step": 230
    },
    {
      "epoch": 2.315270935960591,
      "grad_norm": 0.55078125,
      "learning_rate": 0.00018946643298804793,
      "loss": 1.0559,
      "step": 235
    },
    {
      "epoch": 2.3645320197044333,
      "grad_norm": 0.609375,
      "learning_rate": 0.00018868112693808665,
      "loss": 1.0469,
      "step": 240
    },
    {
      "epoch": 2.413793103448276,
      "grad_norm": 0.6953125,
      "learning_rate": 0.00018786934000992688,
      "loss": 1.0502,
      "step": 245
    },
    {
      "epoch": 2.4630541871921183,
      "grad_norm": 0.53515625,
      "learning_rate": 0.00018703131460949554,
      "loss": 1.058,
      "step": 250
    },
    {
      "epoch": 2.512315270935961,
      "grad_norm": 0.53515625,
      "learning_rate": 0.0001861673009777325,
      "loss": 1.0501,
      "step": 255
    },
    {
      "epoch": 2.561576354679803,
      "grad_norm": 0.5546875,
      "learning_rate": 0.00018527755711586678,
      "loss": 1.0516,
      "step": 260
    },
    {
      "epoch": 2.6108374384236455,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00018436234870837547,
      "loss": 1.0503,
      "step": 265
    },
    {
      "epoch": 2.6600985221674875,
      "grad_norm": 0.52734375,
      "learning_rate": 0.00018342194904364813,
      "loss": 1.0539,
      "step": 270
    },
    {
      "epoch": 2.70935960591133,
      "grad_norm": 0.5078125,
      "learning_rate": 0.00018245663893238075,
      "loss": 1.0407,
      "step": 275
    },
    {
      "epoch": 2.7586206896551726,
      "grad_norm": 0.466796875,
      "learning_rate": 0.00018146670662372354,
      "loss": 1.0412,
      "step": 280
    },
    {
      "epoch": 2.8078817733990147,
      "grad_norm": 0.72265625,
      "learning_rate": 0.0001804524477192075,
      "loss": 1.0412,
      "step": 285
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.4921875,
      "learning_rate": 0.00017941416508447536,
      "loss": 1.0286,
      "step": 290
    },
    {
      "epoch": 2.9064039408866993,
      "grad_norm": 0.63671875,
      "learning_rate": 0.00017835216875884368,
      "loss": 1.0476,
      "step": 295
    },
    {
      "epoch": 2.955665024630542,
      "grad_norm": 0.5390625,
      "learning_rate": 0.00017726677586272263,
      "loss": 1.0436,
      "step": 300
    },
    {
      "epoch": 2.9950738916256157,
      "eval_loss": 2.479555606842041,
      "eval_runtime": 0.6669,
      "eval_samples_per_second": 14.995,
      "eval_steps_per_second": 1.5,
      "step": 304
    },
    {
      "epoch": 3.0049261083743843,
      "grad_norm": 0.455078125,
      "learning_rate": 0.0001761583105029213,
      "loss": 1.0275,
      "step": 305
    },
    {
      "epoch": 3.0541871921182264,
      "grad_norm": 0.6640625,
      "learning_rate": 0.00017502710367586687,
      "loss": 1.0062,
      "step": 310
    },
    {
      "epoch": 3.103448275862069,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00017387349316876666,
      "loss": 1.018,
      "step": 315
    },
    {
      "epoch": 3.1527093596059115,
      "grad_norm": 0.68359375,
      "learning_rate": 0.00017269782345874203,
      "loss": 1.005,
      "step": 320
    },
    {
      "epoch": 3.2019704433497536,
      "grad_norm": 0.66015625,
      "learning_rate": 0.00017150044560996488,
      "loss": 1.0104,
      "step": 325
    },
    {
      "epoch": 3.251231527093596,
      "grad_norm": 0.609375,
      "learning_rate": 0.00017028171716882714,
      "loss": 1.0039,
      "step": 330
    },
    {
      "epoch": 3.3004926108374386,
      "grad_norm": 0.494140625,
      "learning_rate": 0.0001690420020571747,
      "loss": 1.0177,
      "step": 335
    },
    {
      "epoch": 3.3497536945812807,
      "grad_norm": 0.76171875,
      "learning_rate": 0.00016778167046363734,
      "loss": 1.0066,
      "step": 340
    },
    {
      "epoch": 3.399014778325123,
      "grad_norm": 0.5859375,
      "learning_rate": 0.00016650109873308765,
      "loss": 1.0187,
      "step": 345
    },
    {
      "epoch": 3.4482758620689653,
      "grad_norm": 0.498046875,
      "learning_rate": 0.00016520066925426144,
      "loss": 1.0157,
      "step": 350
    },
    {
      "epoch": 3.497536945812808,
      "grad_norm": 0.734375,
      "learning_rate": 0.00016388077034557355,
      "loss": 1.0104,
      "step": 355
    },
    {
      "epoch": 3.5467980295566504,
      "grad_norm": 0.5546875,
      "learning_rate": 0.00016254179613916278,
      "loss": 1.0177,
      "step": 360
    },
    {
      "epoch": 3.596059113300493,
      "grad_norm": 0.4765625,
      "learning_rate": 0.0001611841464632011,
      "loss": 1.0193,
      "step": 365
    },
    {
      "epoch": 3.645320197044335,
      "grad_norm": 0.52734375,
      "learning_rate": 0.0001598082267225018,
      "loss": 1.0096,
      "step": 370
    },
    {
      "epoch": 3.6945812807881775,
      "grad_norm": 0.6640625,
      "learning_rate": 0.0001584144477774623,
      "loss": 1.025,
      "step": 375
    },
    {
      "epoch": 3.7438423645320196,
      "grad_norm": 0.53515625,
      "learning_rate": 0.00015700322582137827,
      "loss": 1.0125,
      "step": 380
    },
    {
      "epoch": 3.793103448275862,
      "grad_norm": 0.50390625,
      "learning_rate": 0.00015557498225616487,
      "loss": 1.0022,
      "step": 385
    },
    {
      "epoch": 3.8423645320197046,
      "grad_norm": 0.578125,
      "learning_rate": 0.00015413014356652286,
      "loss": 1.007,
      "step": 390
    },
    {
      "epoch": 3.8916256157635467,
      "grad_norm": 0.5078125,
      "learning_rate": 0.000152669141192587,
      "loss": 1.0013,
      "step": 395
    },
    {
      "epoch": 3.9408866995073892,
      "grad_norm": 0.50390625,
      "learning_rate": 0.00015119241140109467,
      "loss": 1.0009,
      "step": 400
    },
    {
      "epoch": 3.9901477832512313,
      "grad_norm": 0.5625,
      "learning_rate": 0.00014970039515511304,
      "loss": 1.0084,
      "step": 405
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.494363307952881,
      "eval_runtime": 0.5386,
      "eval_samples_per_second": 18.567,
      "eval_steps_per_second": 1.857,
      "step": 406
    },
    {
      "epoch": 4.039408866995074,
      "grad_norm": 0.50390625,
      "learning_rate": 0.00014819353798236427,
      "loss": 0.9801,
      "step": 410
    },
    {
      "epoch": 4.088669950738916,
      "grad_norm": 0.60546875,
      "learning_rate": 0.0001466722898421873,
      "loss": 0.9817,
      "step": 415
    },
    {
      "epoch": 4.137931034482759,
      "grad_norm": 0.5390625,
      "learning_rate": 0.00014513710499117647,
      "loss": 0.988,
      "step": 420
    },
    {
      "epoch": 4.187192118226601,
      "grad_norm": 0.51171875,
      "learning_rate": 0.00014358844184753712,
      "loss": 0.9782,
      "step": 425
    },
    {
      "epoch": 4.236453201970443,
      "grad_norm": 0.546875,
      "learning_rate": 0.00014202676285419812,
      "loss": 0.9812,
      "step": 430
    },
    {
      "epoch": 4.285714285714286,
      "grad_norm": 0.7421875,
      "learning_rate": 0.0001404525343407228,
      "loss": 0.9897,
      "step": 435
    },
    {
      "epoch": 4.334975369458128,
      "grad_norm": 0.859375,
      "learning_rate": 0.00013886622638405952,
      "loss": 0.992,
      "step": 440
    },
    {
      "epoch": 4.384236453201971,
      "grad_norm": 0.5390625,
      "learning_rate": 0.00013726831266817278,
      "loss": 0.9933,
      "step": 445
    },
    {
      "epoch": 4.433497536945813,
      "grad_norm": 0.54296875,
      "learning_rate": 0.0001356592703425976,
      "loss": 0.9742,
      "step": 450
    },
    {
      "epoch": 4.482758620689655,
      "grad_norm": 0.6953125,
      "learning_rate": 0.00013403957987995882,
      "loss": 0.9777,
      "step": 455
    },
    {
      "epoch": 4.532019704433497,
      "grad_norm": 0.63671875,
      "learning_rate": 0.00013240972493249847,
      "loss": 0.9853,
      "step": 460
    },
    {
      "epoch": 4.58128078817734,
      "grad_norm": 0.474609375,
      "learning_rate": 0.00013077019218765305,
      "loss": 0.9791,
      "step": 465
    },
    {
      "epoch": 4.630541871921182,
      "grad_norm": 0.53125,
      "learning_rate": 0.00012912147122272523,
      "loss": 0.9857,
      "step": 470
    },
    {
      "epoch": 4.679802955665025,
      "grad_norm": 0.48046875,
      "learning_rate": 0.00012746405435869198,
      "loss": 0.9854,
      "step": 475
    },
    {
      "epoch": 4.7290640394088665,
      "grad_norm": 0.43359375,
      "learning_rate": 0.0001257984365131938,
      "loss": 0.9836,
      "step": 480
    },
    {
      "epoch": 4.778325123152709,
      "grad_norm": 0.478515625,
      "learning_rate": 0.00012412511505274844,
      "loss": 0.9939,
      "step": 485
    },
    {
      "epoch": 4.827586206896552,
      "grad_norm": 0.494140625,
      "learning_rate": 0.00012244458964423327,
      "loss": 0.9685,
      "step": 490
    },
    {
      "epoch": 4.876847290640394,
      "grad_norm": 0.55859375,
      "learning_rate": 0.0001207573621056809,
      "loss": 0.9806,
      "step": 495
    },
    {
      "epoch": 4.926108374384237,
      "grad_norm": 0.494140625,
      "learning_rate": 0.00011906393625643244,
      "loss": 0.979,
      "step": 500
    },
    {
      "epoch": 4.975369458128079,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.9913,
      "step": 505
    },
    {
      "epoch": 4.995073891625616,
      "eval_loss": 2.50097918510437,
      "eval_runtime": 0.6794,
      "eval_samples_per_second": 14.718,
      "eval_steps_per_second": 1.472,
      "step": 507
    },
    {
      "epoch": 5.024630541871921,
      "grad_norm": 0.57421875,
      "learning_rate": 0.00011566051400653486,
      "loss": 0.9714,
      "step": 510
    },
    {
      "epoch": 5.073891625615763,
      "grad_norm": 0.6171875,
      "learning_rate": 0.00011395153389439233,
      "loss": 0.9602,
      "step": 515
    },
    {
      "epoch": 5.123152709359606,
      "grad_norm": 0.5234375,
      "learning_rate": 0.00011223838774509514,
      "loss": 0.9657,
      "step": 520
    },
    {
      "epoch": 5.172413793103448,
      "grad_norm": 0.490234375,
      "learning_rate": 0.00011052158711748434,
      "loss": 0.9526,
      "step": 525
    },
    {
      "epoch": 5.221674876847291,
      "grad_norm": 0.5234375,
      "learning_rate": 0.00010880164466165674,
      "loss": 0.958,
      "step": 530
    },
    {
      "epoch": 5.2709359605911335,
      "grad_norm": 0.498046875,
      "learning_rate": 0.00010707907396588361,
      "loss": 0.9666,
      "step": 535
    },
    {
      "epoch": 5.320197044334975,
      "grad_norm": 0.52734375,
      "learning_rate": 0.0001053543894032493,
      "loss": 0.9625,
      "step": 540
    },
    {
      "epoch": 5.369458128078818,
      "grad_norm": 0.515625,
      "learning_rate": 0.00010362810597805526,
      "loss": 0.9657,
      "step": 545
    },
    {
      "epoch": 5.41871921182266,
      "grad_norm": 0.482421875,
      "learning_rate": 0.00010190073917203589,
      "loss": 0.9655,
      "step": 550
    },
    {
      "epoch": 5.467980295566503,
      "grad_norm": 0.515625,
      "learning_rate": 0.00010017280479043147,
      "loss": 0.9665,
      "step": 555
    },
    {
      "epoch": 5.517241379310345,
      "grad_norm": 0.498046875,
      "learning_rate": 9.844481880796491e-05,
      "loss": 0.9587,
      "step": 560
    },
    {
      "epoch": 5.566502463054187,
      "grad_norm": 0.609375,
      "learning_rate": 9.671729721476746e-05,
      "loss": 0.9665,
      "step": 565
    },
    {
      "epoch": 5.615763546798029,
      "grad_norm": 0.5390625,
      "learning_rate": 9.499075586230013e-05,
      "loss": 0.9554,
      "step": 570
    },
    {
      "epoch": 5.665024630541872,
      "grad_norm": 0.625,
      "learning_rate": 9.326571030931637e-05,
      "loss": 0.9607,
      "step": 575
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 0.68359375,
      "learning_rate": 9.154267566791223e-05,
      "loss": 0.9669,
      "step": 580
    },
    {
      "epoch": 5.763546798029557,
      "grad_norm": 0.61328125,
      "learning_rate": 8.982216644970979e-05,
      "loss": 0.9628,
      "step": 585
    },
    {
      "epoch": 5.812807881773399,
      "grad_norm": 0.6171875,
      "learning_rate": 8.810469641222001e-05,
      "loss": 0.9511,
      "step": 590
    },
    {
      "epoch": 5.862068965517241,
      "grad_norm": 0.51171875,
      "learning_rate": 8.639077840543077e-05,
      "loss": 0.9684,
      "step": 595
    },
    {
      "epoch": 5.911330049261084,
      "grad_norm": 0.671875,
      "learning_rate": 8.468092421866573e-05,
      "loss": 0.9678,
      "step": 600
    },
    {
      "epoch": 5.960591133004926,
      "grad_norm": 0.65625,
      "learning_rate": 8.297564442776014e-05,
      "loss": 0.9588,
      "step": 605
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.5066332817077637,
      "eval_runtime": 0.5391,
      "eval_samples_per_second": 18.548,
      "eval_steps_per_second": 1.855,
      "step": 609
    },
    {
      "epoch": 6.009852216748769,
      "grad_norm": 0.68359375,
      "learning_rate": 8.127544824259889e-05,
      "loss": 0.953,
      "step": 610
    },
    {
      "epoch": 6.059113300492611,
      "grad_norm": 0.734375,
      "learning_rate": 7.958084335506239e-05,
      "loss": 0.9536,
      "step": 615
    },
    {
      "epoch": 6.108374384236453,
      "grad_norm": 0.50390625,
      "learning_rate": 7.789233578742582e-05,
      "loss": 0.9446,
      "step": 620
    },
    {
      "epoch": 6.157635467980295,
      "grad_norm": 0.498046875,
      "learning_rate": 7.6210429741257e-05,
      "loss": 0.9353,
      "step": 625
    },
    {
      "epoch": 6.206896551724138,
      "grad_norm": 0.5078125,
      "learning_rate": 7.453562744685778e-05,
      "loss": 0.9449,
      "step": 630
    },
    {
      "epoch": 6.25615763546798,
      "grad_norm": 0.5625,
      "learning_rate": 7.286842901329412e-05,
      "loss": 0.9535,
      "step": 635
    },
    {
      "epoch": 6.305418719211823,
      "grad_norm": 0.498046875,
      "learning_rate": 7.12093322790597e-05,
      "loss": 0.9376,
      "step": 640
    },
    {
      "epoch": 6.3546798029556655,
      "grad_norm": 0.51953125,
      "learning_rate": 6.955883266341741e-05,
      "loss": 0.9461,
      "step": 645
    },
    {
      "epoch": 6.403940886699507,
      "grad_norm": 0.5,
      "learning_rate": 6.791742301846326e-05,
      "loss": 0.9503,
      "step": 650
    },
    {
      "epoch": 6.45320197044335,
      "grad_norm": 0.46875,
      "learning_rate": 6.62855934819569e-05,
      "loss": 0.9447,
      "step": 655
    },
    {
      "epoch": 6.502463054187192,
      "grad_norm": 0.6328125,
      "learning_rate": 6.466383133096267e-05,
      "loss": 0.9453,
      "step": 660
    },
    {
      "epoch": 6.551724137931035,
      "grad_norm": 0.53125,
      "learning_rate": 6.305262083634488e-05,
      "loss": 0.944,
      "step": 665
    },
    {
      "epoch": 6.600985221674877,
      "grad_norm": 0.53125,
      "learning_rate": 6.145244311816063e-05,
      "loss": 0.9366,
      "step": 670
    },
    {
      "epoch": 6.650246305418719,
      "grad_norm": 0.5234375,
      "learning_rate": 5.986377600199371e-05,
      "loss": 0.9405,
      "step": 675
    },
    {
      "epoch": 6.699507389162561,
      "grad_norm": 0.482421875,
      "learning_rate": 5.828709387627218e-05,
      "loss": 0.9521,
      "step": 680
    },
    {
      "epoch": 6.748768472906404,
      "grad_norm": 0.4453125,
      "learning_rate": 5.6722867550612116e-05,
      "loss": 0.9509,
      "step": 685
    },
    {
      "epoch": 6.798029556650246,
      "grad_norm": 0.59765625,
      "learning_rate": 5.5171564115230254e-05,
      "loss": 0.9625,
      "step": 690
    },
    {
      "epoch": 6.847290640394089,
      "grad_norm": 0.447265625,
      "learning_rate": 5.363364680146725e-05,
      "loss": 0.9496,
      "step": 695
    },
    {
      "epoch": 6.896551724137931,
      "grad_norm": 0.439453125,
      "learning_rate": 5.210957484346314e-05,
      "loss": 0.9457,
      "step": 700
    },
    {
      "epoch": 6.945812807881773,
      "grad_norm": 0.482421875,
      "learning_rate": 5.059980334102637e-05,
      "loss": 0.9377,
      "step": 705
    },
    {
      "epoch": 6.995073891625616,
      "grad_norm": 0.453125,
      "learning_rate": 4.9104783123737566e-05,
      "loss": 0.9459,
      "step": 710
    },
    {
      "epoch": 6.995073891625616,
      "eval_loss": 2.516418933868408,
      "eval_runtime": 0.674,
      "eval_samples_per_second": 14.837,
      "eval_steps_per_second": 1.484,
      "step": 710
    },
    {
      "epoch": 7.044334975369458,
      "grad_norm": 0.48828125,
      "learning_rate": 4.762496061632814e-05,
      "loss": 0.9341,
      "step": 715
    },
    {
      "epoch": 7.093596059113301,
      "grad_norm": 0.51953125,
      "learning_rate": 4.6160777705374524e-05,
      "loss": 0.938,
      "step": 720
    },
    {
      "epoch": 7.142857142857143,
      "grad_norm": 0.50390625,
      "learning_rate": 4.471267160734731e-05,
      "loss": 0.9366,
      "step": 725
    },
    {
      "epoch": 7.192118226600985,
      "grad_norm": 0.53125,
      "learning_rate": 4.328107473805487e-05,
      "loss": 0.9403,
      "step": 730
    },
    {
      "epoch": 7.241379310344827,
      "grad_norm": 0.45703125,
      "learning_rate": 4.1866414583520877e-05,
      "loss": 0.9387,
      "step": 735
    },
    {
      "epoch": 7.29064039408867,
      "grad_norm": 0.71484375,
      "learning_rate": 4.046911357233343e-05,
      "loss": 0.9334,
      "step": 740
    },
    {
      "epoch": 7.3399014778325125,
      "grad_norm": 0.498046875,
      "learning_rate": 3.9089588949504655e-05,
      "loss": 0.93,
      "step": 745
    },
    {
      "epoch": 7.389162561576355,
      "grad_norm": 0.453125,
      "learning_rate": 3.772825265187802e-05,
      "loss": 0.9298,
      "step": 750
    },
    {
      "epoch": 7.4384236453201975,
      "grad_norm": 0.474609375,
      "learning_rate": 3.638551118512089e-05,
      "loss": 0.9486,
      "step": 755
    },
    {
      "epoch": 7.487684729064039,
      "grad_norm": 0.447265625,
      "learning_rate": 3.506176550233863e-05,
      "loss": 0.9373,
      "step": 760
    },
    {
      "epoch": 7.536945812807882,
      "grad_norm": 0.423828125,
      "learning_rate": 3.3757410884346894e-05,
      "loss": 0.939,
      "step": 765
    },
    {
      "epoch": 7.586206896551724,
      "grad_norm": 0.42578125,
      "learning_rate": 3.2472836821637744e-05,
      "loss": 0.9325,
      "step": 770
    },
    {
      "epoch": 7.635467980295567,
      "grad_norm": 0.458984375,
      "learning_rate": 3.120842689807468e-05,
      "loss": 0.932,
      "step": 775
    },
    {
      "epoch": 7.684729064039409,
      "grad_norm": 0.51171875,
      "learning_rate": 2.996455867635155e-05,
      "loss": 0.9259,
      "step": 780
    },
    {
      "epoch": 7.733990147783251,
      "grad_norm": 0.44921875,
      "learning_rate": 2.874160358524931e-05,
      "loss": 0.9277,
      "step": 785
    },
    {
      "epoch": 7.783251231527093,
      "grad_norm": 0.458984375,
      "learning_rate": 2.753992680872457e-05,
      "loss": 0.9259,
      "step": 790
    },
    {
      "epoch": 7.832512315270936,
      "grad_norm": 0.447265625,
      "learning_rate": 2.6359887176862718e-05,
      "loss": 0.9431,
      "step": 795
    },
    {
      "epoch": 7.8817733990147785,
      "grad_norm": 0.46875,
      "learning_rate": 2.5201837058728505e-05,
      "loss": 0.9323,
      "step": 800
    },
    {
      "epoch": 7.931034482758621,
      "grad_norm": 0.4296875,
      "learning_rate": 2.4066122257145894e-05,
      "loss": 0.9294,
      "step": 805
    },
    {
      "epoch": 7.980295566502463,
      "grad_norm": 0.453125,
      "learning_rate": 2.295308190543859e-05,
      "loss": 0.943,
      "step": 810
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.523322582244873,
      "eval_runtime": 0.5404,
      "eval_samples_per_second": 18.504,
      "eval_steps_per_second": 1.85,
      "step": 812
    },
    {
      "epoch": 8.029556650246306,
      "grad_norm": 0.419921875,
      "learning_rate": 2.1863048366162208e-05,
      "loss": 0.9428,
      "step": 815
    },
    {
      "epoch": 8.078817733990148,
      "grad_norm": 0.44140625,
      "learning_rate": 2.0796347131858186e-05,
      "loss": 0.931,
      "step": 820
    },
    {
      "epoch": 8.12807881773399,
      "grad_norm": 0.431640625,
      "learning_rate": 1.9753296727859195e-05,
      "loss": 0.9263,
      "step": 825
    },
    {
      "epoch": 8.177339901477833,
      "grad_norm": 0.455078125,
      "learning_rate": 1.8734208617174988e-05,
      "loss": 0.9426,
      "step": 830
    },
    {
      "epoch": 8.226600985221674,
      "grad_norm": 0.453125,
      "learning_rate": 1.773938710748706e-05,
      "loss": 0.9267,
      "step": 835
    },
    {
      "epoch": 8.275862068965518,
      "grad_norm": 0.427734375,
      "learning_rate": 1.676912926028007e-05,
      "loss": 0.9269,
      "step": 840
    },
    {
      "epoch": 8.32512315270936,
      "grad_norm": 0.427734375,
      "learning_rate": 1.5823724802136865e-05,
      "loss": 0.9294,
      "step": 845
    },
    {
      "epoch": 8.374384236453203,
      "grad_norm": 0.419921875,
      "learning_rate": 1.4903456038223939e-05,
      "loss": 0.9222,
      "step": 850
    },
    {
      "epoch": 8.423645320197044,
      "grad_norm": 0.412109375,
      "learning_rate": 1.4008597767992871e-05,
      "loss": 0.929,
      "step": 855
    },
    {
      "epoch": 8.472906403940886,
      "grad_norm": 0.42578125,
      "learning_rate": 1.3139417203123027e-05,
      "loss": 0.9285,
      "step": 860
    },
    {
      "epoch": 8.52216748768473,
      "grad_norm": 0.421875,
      "learning_rate": 1.2296173887730123e-05,
      "loss": 0.9346,
      "step": 865
    },
    {
      "epoch": 8.571428571428571,
      "grad_norm": 0.42578125,
      "learning_rate": 1.1479119620864276e-05,
      "loss": 0.9252,
      "step": 870
    },
    {
      "epoch": 8.620689655172415,
      "grad_norm": 0.4375,
      "learning_rate": 1.0688498381320855e-05,
      "loss": 0.929,
      "step": 875
    },
    {
      "epoch": 8.669950738916256,
      "grad_norm": 0.66796875,
      "learning_rate": 9.924546254786493e-06,
      "loss": 0.9381,
      "step": 880
    },
    {
      "epoch": 8.719211822660098,
      "grad_norm": 0.453125,
      "learning_rate": 9.187491363342093e-06,
      "loss": 0.9374,
      "step": 885
    },
    {
      "epoch": 8.768472906403941,
      "grad_norm": 0.458984375,
      "learning_rate": 8.47755379734373e-06,
      "loss": 0.9267,
      "step": 890
    },
    {
      "epoch": 8.817733990147783,
      "grad_norm": 0.447265625,
      "learning_rate": 7.794945549701993e-06,
      "loss": 0.9369,
      "step": 895
    },
    {
      "epoch": 8.866995073891626,
      "grad_norm": 0.43359375,
      "learning_rate": 7.1398704525792e-06,
      "loss": 0.9371,
      "step": 900
    },
    {
      "epoch": 8.916256157635468,
      "grad_norm": 0.4375,
      "learning_rate": 6.512524116523633e-06,
      "loss": 0.9348,
      "step": 905
    },
    {
      "epoch": 8.96551724137931,
      "grad_norm": 0.44140625,
      "learning_rate": 5.913093872058528e-06,
      "loss": 0.9169,
      "step": 910
    },
    {
      "epoch": 8.995073891625616,
      "eval_loss": 2.523988723754883,
      "eval_runtime": 0.6711,
      "eval_samples_per_second": 14.9,
      "eval_steps_per_second": 1.49,
      "step": 913
    },
    {
      "epoch": 9.014778325123153,
      "grad_norm": 0.427734375,
      "learning_rate": 5.341758713743828e-06,
      "loss": 0.9254,
      "step": 915
    },
    {
      "epoch": 9.064039408866995,
      "grad_norm": 0.427734375,
      "learning_rate": 4.798689246727006e-06,
      "loss": 0.929,
      "step": 920
    },
    {
      "epoch": 9.113300492610838,
      "grad_norm": 0.421875,
      "learning_rate": 4.2840476357989825e-06,
      "loss": 0.9346,
      "step": 925
    },
    {
      "epoch": 9.16256157635468,
      "grad_norm": 0.4140625,
      "learning_rate": 3.797987556970495e-06,
      "loss": 0.936,
      "step": 930
    },
    {
      "epoch": 9.211822660098521,
      "grad_norm": 0.419921875,
      "learning_rate": 3.3406541515832003e-06,
      "loss": 0.9283,
      "step": 935
    },
    {
      "epoch": 9.261083743842365,
      "grad_norm": 0.41015625,
      "learning_rate": 2.912183982969385e-06,
      "loss": 0.9255,
      "step": 940
    },
    {
      "epoch": 9.310344827586206,
      "grad_norm": 0.421875,
      "learning_rate": 2.5127049956730207e-06,
      "loss": 0.9252,
      "step": 945
    },
    {
      "epoch": 9.35960591133005,
      "grad_norm": 0.431640625,
      "learning_rate": 2.1423364772445887e-06,
      "loss": 0.9276,
      "step": 950
    },
    {
      "epoch": 9.408866995073891,
      "grad_norm": 0.423828125,
      "learning_rate": 1.8011890226208527e-06,
      "loss": 0.9251,
      "step": 955
    },
    {
      "epoch": 9.458128078817733,
      "grad_norm": 0.4140625,
      "learning_rate": 1.489364501100332e-06,
      "loss": 0.9291,
      "step": 960
    },
    {
      "epoch": 9.507389162561577,
      "grad_norm": 0.423828125,
      "learning_rate": 1.2069560259243328e-06,
      "loss": 0.9333,
      "step": 965
    },
    {
      "epoch": 9.556650246305418,
      "grad_norm": 0.4375,
      "learning_rate": 9.540479264726676e-07,
      "loss": 0.9332,
      "step": 970
    },
    {
      "epoch": 9.605911330049262,
      "grad_norm": 0.4140625,
      "learning_rate": 7.307157230821426e-07,
      "loss": 0.9297,
      "step": 975
    },
    {
      "epoch": 9.655172413793103,
      "grad_norm": 0.40234375,
      "learning_rate": 5.370261044956971e-07,
      "loss": 0.9222,
      "step": 980
    },
    {
      "epoch": 9.704433497536947,
      "grad_norm": 0.421875,
      "learning_rate": 3.73036907948543e-07,
      "loss": 0.9396,
      "step": 985
    },
    {
      "epoch": 9.753694581280788,
      "grad_norm": 0.416015625,
      "learning_rate": 2.3879710189753656e-07,
      "loss": 0.931,
      "step": 990
    },
    {
      "epoch": 9.80295566502463,
      "grad_norm": 0.43359375,
      "learning_rate": 1.3434677139885222e-07,
      "loss": 0.9309,
      "step": 995
    },
    {
      "epoch": 9.852216748768473,
      "grad_norm": 0.4140625,
      "learning_rate": 5.971710613821291e-08,
      "loss": 0.9223,
      "step": 1000
    },
    {
      "epoch": 9.901477832512315,
      "grad_norm": 0.470703125,
      "learning_rate": 1.4930391117451426e-08,
      "loss": 0.9251,
      "step": 1005
    },
    {
      "epoch": 9.950738916256158,
      "grad_norm": 0.427734375,
      "learning_rate": 0.0,
      "loss": 0.925,
      "step": 1010
    },
    {
      "epoch": 9.950738916256158,
      "eval_loss": 2.523752212524414,
      "eval_runtime": 0.5416,
      "eval_samples_per_second": 18.464,
      "eval_steps_per_second": 1.846,
      "step": 1010
    },
    {
      "epoch": 9.950738916256158,
      "step": 1010,
      "total_flos": 5.932470720905871e+17,
      "train_loss": 1.07745361446154,
      "train_runtime": 3468.402,
      "train_samples_per_second": 13.998,
      "train_steps_per_second": 0.291
    }
  ],
  "logging_steps": 5,
  "max_steps": 1010,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "total_flos": 5.932470720905871e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}