|
{ |
|
"best_metric": 4.30883264541626, |
|
"best_model_checkpoint": "/home/datta0/models/lora_final/Mistral-7B-v0.3_metamath_default/checkpoint-572", |
|
"epoch": 0.9995949777237748, |
|
"eval_steps": 13, |
|
"global_step": 617, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0016200891049007696, |
|
"grad_norm": 31.97899627685547, |
|
"learning_rate": 2.3076923076923076e-05, |
|
"loss": 0.9728, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.011340623734305387, |
|
"grad_norm": 25.20884132385254, |
|
"learning_rate": 0.00016153846153846153, |
|
"loss": 0.7439, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.021061158363710003, |
|
"eval_loss": 4.518104553222656, |
|
"eval_runtime": 13.0104, |
|
"eval_samples_per_second": 38.431, |
|
"eval_steps_per_second": 4.842, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.022681247468610773, |
|
"grad_norm": 9073.8154296875, |
|
"learning_rate": 0.0002999979709808197, |
|
"loss": 1.7362, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03402187120291616, |
|
"grad_norm": 35.26778793334961, |
|
"learning_rate": 0.0002998701612152596, |
|
"loss": 8.6977, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.042122316727420006, |
|
"eval_loss": 6.681583404541016, |
|
"eval_runtime": 84.9655, |
|
"eval_samples_per_second": 5.885, |
|
"eval_steps_per_second": 0.741, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.04536249493722155, |
|
"grad_norm": 12.110719680786133, |
|
"learning_rate": 0.0002995437011859465, |
|
"loss": 6.8454, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.056703118671526935, |
|
"grad_norm": 3.0557332038879395, |
|
"learning_rate": 0.00029901902360990936, |
|
"loss": 6.6139, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06318347509113001, |
|
"eval_loss": 6.469002723693848, |
|
"eval_runtime": 82.1892, |
|
"eval_samples_per_second": 6.084, |
|
"eval_steps_per_second": 0.767, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.06804374240583232, |
|
"grad_norm": 3.914517879486084, |
|
"learning_rate": 0.00029829682393805085, |
|
"loss": 6.4334, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0793843661401377, |
|
"grad_norm": 15.051484107971191, |
|
"learning_rate": 0.0002973780594333385, |
|
"loss": 6.3559, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.08424463345484001, |
|
"eval_loss": 6.440323829650879, |
|
"eval_runtime": 12.6259, |
|
"eval_samples_per_second": 39.601, |
|
"eval_steps_per_second": 4.99, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0907249898744431, |
|
"grad_norm": 4.897040843963623, |
|
"learning_rate": 0.00029626394790197025, |
|
"loss": 6.31, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.10206561360874848, |
|
"grad_norm": 5.669849872589111, |
|
"learning_rate": 0.00029495596607919305, |
|
"loss": 6.2844, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.10530579181855002, |
|
"eval_loss": 6.300673007965088, |
|
"eval_runtime": 83.0722, |
|
"eval_samples_per_second": 6.019, |
|
"eval_steps_per_second": 0.758, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.11340623734305387, |
|
"grad_norm": 21.01810073852539, |
|
"learning_rate": 0.00029345584767191685, |
|
"loss": 6.2212, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12474686107735926, |
|
"grad_norm": 4.398944854736328, |
|
"learning_rate": 0.0002917655810607161, |
|
"loss": 6.1379, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.12636695018226002, |
|
"eval_loss": 6.0592122077941895, |
|
"eval_runtime": 91.4719, |
|
"eval_samples_per_second": 5.466, |
|
"eval_steps_per_second": 0.689, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.13608748481166463, |
|
"grad_norm": 3.180732250213623, |
|
"learning_rate": 0.0002898874066642667, |
|
"loss": 5.9832, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.14742810854597002, |
|
"grad_norm": 38.04954147338867, |
|
"learning_rate": 0.00028782381396971003, |
|
"loss": 5.9506, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.14742810854597002, |
|
"eval_loss": 5.914348602294922, |
|
"eval_runtime": 12.9694, |
|
"eval_samples_per_second": 38.552, |
|
"eval_steps_per_second": 4.858, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.1587687322802754, |
|
"grad_norm": 40.107906341552734, |
|
"learning_rate": 0.00028557753823288173, |
|
"loss": 5.8886, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.16848926690968002, |
|
"eval_loss": 5.989041805267334, |
|
"eval_runtime": 88.0067, |
|
"eval_samples_per_second": 5.681, |
|
"eval_steps_per_second": 0.716, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.1701093560145808, |
|
"grad_norm": 47.72571563720703, |
|
"learning_rate": 0.0002831515568527781, |
|
"loss": 5.8829, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.1814499797488862, |
|
"grad_norm": 8.360991477966309, |
|
"learning_rate": 0.00028054908542506627, |
|
"loss": 5.7387, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.18955042527339003, |
|
"eval_loss": 5.65593957901001, |
|
"eval_runtime": 82.1522, |
|
"eval_samples_per_second": 6.086, |
|
"eval_steps_per_second": 0.767, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.19279060348319157, |
|
"grad_norm": 21.80311393737793, |
|
"learning_rate": 0.00027777357347986823, |
|
"loss": 5.6862, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.20413122721749696, |
|
"grad_norm": 19.518041610717773, |
|
"learning_rate": 0.00027482869990946986, |
|
"loss": 5.672, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.21061158363710003, |
|
"eval_loss": 5.724035739898682, |
|
"eval_runtime": 12.9581, |
|
"eval_samples_per_second": 38.586, |
|
"eval_steps_per_second": 4.862, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.21547185095180235, |
|
"grad_norm": 20.255475997924805, |
|
"learning_rate": 0.0002717183680920135, |
|
"loss": 5.6345, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.22681247468610774, |
|
"grad_norm": 8.896872520446777, |
|
"learning_rate": 0.00026844670071763906, |
|
"loss": 5.5425, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.23167274200081003, |
|
"eval_loss": 5.5164642333984375, |
|
"eval_runtime": 90.7285, |
|
"eval_samples_per_second": 5.511, |
|
"eval_steps_per_second": 0.694, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.23815309842041313, |
|
"grad_norm": 22.878341674804688, |
|
"learning_rate": 0.00026501803432393037, |
|
"loss": 5.4354, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.24949372215471852, |
|
"grad_norm": 25.566158294677734, |
|
"learning_rate": 0.00026143691354791145, |
|
"loss": 5.473, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.25273390036452004, |
|
"eval_loss": 5.423782825469971, |
|
"eval_runtime": 86.2539, |
|
"eval_samples_per_second": 5.797, |
|
"eval_steps_per_second": 0.73, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.2608343458890239, |
|
"grad_norm": 11.549739837646484, |
|
"learning_rate": 0.00025770808510220956, |
|
"loss": 5.3333, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.27217496962332927, |
|
"grad_norm": 16.626855850219727, |
|
"learning_rate": 0.00025383649148337105, |
|
"loss": 5.328, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.27379505872823007, |
|
"eval_loss": 5.343407154083252, |
|
"eval_runtime": 13.0401, |
|
"eval_samples_per_second": 38.343, |
|
"eval_steps_per_second": 4.831, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.28351559335763465, |
|
"grad_norm": 34.26865005493164, |
|
"learning_rate": 0.0002498272644206695, |
|
"loss": 5.2502, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.29485621709194004, |
|
"grad_norm": 8.343174934387207, |
|
"learning_rate": 0.0002456857180740884, |
|
"loss": 5.241, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.29485621709194004, |
|
"eval_loss": 5.208798885345459, |
|
"eval_runtime": 77.0828, |
|
"eval_samples_per_second": 6.487, |
|
"eval_steps_per_second": 0.817, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.30619684082624543, |
|
"grad_norm": 29.43370819091797, |
|
"learning_rate": 0.0002414173419904956, |
|
"loss": 5.2491, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.3159173754556501, |
|
"eval_loss": 5.329117298126221, |
|
"eval_runtime": 87.9681, |
|
"eval_samples_per_second": 5.684, |
|
"eval_steps_per_second": 0.716, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.3175374645605508, |
|
"grad_norm": 41.14483642578125, |
|
"learning_rate": 0.00023702779382734566, |
|
"loss": 5.3369, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.3288780882948562, |
|
"grad_norm": 12.607691764831543, |
|
"learning_rate": 0.0002325228918535541, |
|
"loss": 5.2994, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.33697853381936005, |
|
"eval_loss": 5.168725490570068, |
|
"eval_runtime": 13.0279, |
|
"eval_samples_per_second": 38.379, |
|
"eval_steps_per_second": 4.836, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.3402187120291616, |
|
"grad_norm": 13.904136657714844, |
|
"learning_rate": 0.00022790860723748442, |
|
"loss": 5.1467, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.351559335763467, |
|
"grad_norm": 27.545360565185547, |
|
"learning_rate": 0.00022319105613226921, |
|
"loss": 5.1595, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.3580396921830701, |
|
"eval_loss": 5.07974910736084, |
|
"eval_runtime": 93.1844, |
|
"eval_samples_per_second": 5.366, |
|
"eval_steps_per_second": 0.676, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.3628999594977724, |
|
"grad_norm": 9.475492477416992, |
|
"learning_rate": 0.00021837649156895706, |
|
"loss": 5.1084, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.37424058323207776, |
|
"grad_norm": 9.638204574584961, |
|
"learning_rate": 0.00021347129516822945, |
|
"loss": 5.0592, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.37910085054678005, |
|
"eval_loss": 5.000487327575684, |
|
"eval_runtime": 87.2972, |
|
"eval_samples_per_second": 5.728, |
|
"eval_steps_per_second": 0.722, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.38558120696638315, |
|
"grad_norm": 19.830158233642578, |
|
"learning_rate": 0.00020848196868167505, |
|
"loss": 5.0021, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.39692183070068854, |
|
"grad_norm": 40.01640319824219, |
|
"learning_rate": 0.000203415125373832, |
|
"loss": 4.9674, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4001620089104901, |
|
"eval_loss": 4.952455520629883, |
|
"eval_runtime": 13.0727, |
|
"eval_samples_per_second": 38.248, |
|
"eval_steps_per_second": 4.819, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.4082624544349939, |
|
"grad_norm": 45.92347717285156, |
|
"learning_rate": 0.00019827748125642242, |
|
"loss": 4.9861, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.4196030781692993, |
|
"grad_norm": 7.856499195098877, |
|
"learning_rate": 0.0001930758461863965, |
|
"loss": 4.9663, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.42122316727420006, |
|
"eval_loss": 4.970428943634033, |
|
"eval_runtime": 79.1825, |
|
"eval_samples_per_second": 6.315, |
|
"eval_steps_per_second": 0.796, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4309437019036047, |
|
"grad_norm": 45.39262771606445, |
|
"learning_rate": 0.0001878171148395872, |
|
"loss": 4.9635, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.4422843256379101, |
|
"grad_norm": 49.14884567260742, |
|
"learning_rate": 0.00018250825757193848, |
|
"loss": 5.0169, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.4422843256379101, |
|
"eval_loss": 4.971817970275879, |
|
"eval_runtime": 80.0188, |
|
"eval_samples_per_second": 6.249, |
|
"eval_steps_per_second": 0.787, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.4536249493722155, |
|
"grad_norm": 13.595600128173828, |
|
"learning_rate": 0.0001771563111804211, |
|
"loss": 4.9333, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.46334548400162007, |
|
"eval_loss": 4.827688694000244, |
|
"eval_runtime": 13.1128, |
|
"eval_samples_per_second": 38.131, |
|
"eval_steps_per_second": 4.804, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.46496557310652087, |
|
"grad_norm": 21.398765563964844, |
|
"learning_rate": 0.0001717683695758819, |
|
"loss": 4.7715, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.47630619684082626, |
|
"grad_norm": 17.379562377929688, |
|
"learning_rate": 0.00016635157438018983, |
|
"loss": 4.8687, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.4844066423653301, |
|
"eval_loss": 4.813088893890381, |
|
"eval_runtime": 90.9286, |
|
"eval_samples_per_second": 5.499, |
|
"eval_steps_per_second": 0.693, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.48764682057513165, |
|
"grad_norm": 36.88113021850586, |
|
"learning_rate": 0.0001609131054601416, |
|
"loss": 4.8246, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.49898744430943703, |
|
"grad_norm": 27.84221076965332, |
|
"learning_rate": 0.00015546017141067432, |
|
"loss": 4.7215, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.5054678007290401, |
|
"eval_loss": 4.7605767250061035, |
|
"eval_runtime": 93.5587, |
|
"eval_samples_per_second": 5.344, |
|
"eval_steps_per_second": 0.673, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.5103280680437424, |
|
"grad_norm": 31.899227142333984, |
|
"learning_rate": 0.00015, |
|
"loss": 4.7799, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.5216686917780478, |
|
"grad_norm": 33.652374267578125, |
|
"learning_rate": 0.0001445398285893257, |
|
"loss": 4.7602, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.5265289590927501, |
|
"eval_loss": 4.704898834228516, |
|
"eval_runtime": 13.0792, |
|
"eval_samples_per_second": 38.229, |
|
"eval_steps_per_second": 4.817, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.5330093155123532, |
|
"grad_norm": 54.98084259033203, |
|
"learning_rate": 0.0001390868945398584, |
|
"loss": 4.6746, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.5443499392466585, |
|
"grad_norm": 29.5, |
|
"learning_rate": 0.00013364842561981014, |
|
"loss": 4.7033, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.5475901174564601, |
|
"eval_loss": 4.78165340423584, |
|
"eval_runtime": 92.9113, |
|
"eval_samples_per_second": 5.381, |
|
"eval_steps_per_second": 0.678, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.555690562980964, |
|
"grad_norm": 16.196989059448242, |
|
"learning_rate": 0.00012823163042411807, |
|
"loss": 4.6363, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.5670311867152693, |
|
"grad_norm": 22.850570678710938, |
|
"learning_rate": 0.0001228436888195789, |
|
"loss": 4.7179, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.56865127582017, |
|
"eval_loss": 4.642824172973633, |
|
"eval_runtime": 79.0871, |
|
"eval_samples_per_second": 6.322, |
|
"eval_steps_per_second": 0.797, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.5783718104495748, |
|
"grad_norm": 19.261425018310547, |
|
"learning_rate": 0.00011749174242806152, |
|
"loss": 4.6463, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.5897124341838801, |
|
"grad_norm": 21.468109130859375, |
|
"learning_rate": 0.00011218288516041279, |
|
"loss": 4.6525, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.5897124341838801, |
|
"eval_loss": 4.596373081207275, |
|
"eval_runtime": 13.1407, |
|
"eval_samples_per_second": 38.05, |
|
"eval_steps_per_second": 4.794, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.6010530579181855, |
|
"grad_norm": 21.91275405883789, |
|
"learning_rate": 0.00010692415381360349, |
|
"loss": 4.5923, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.6107735925475901, |
|
"eval_loss": 4.560819149017334, |
|
"eval_runtime": 78.8178, |
|
"eval_samples_per_second": 6.344, |
|
"eval_steps_per_second": 0.799, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.6123936816524909, |
|
"grad_norm": 33.42408752441406, |
|
"learning_rate": 0.00010172251874357757, |
|
"loss": 4.6014, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.6237343053867963, |
|
"grad_norm": 24.099485397338867, |
|
"learning_rate": 9.658487462616794e-05, |
|
"loss": 4.5936, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.6318347509113001, |
|
"eval_loss": 4.567561626434326, |
|
"eval_runtime": 88.6357, |
|
"eval_samples_per_second": 5.641, |
|
"eval_steps_per_second": 0.711, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6350749291211016, |
|
"grad_norm": 19.200098037719727, |
|
"learning_rate": 9.151803131832493e-05, |
|
"loss": 4.5747, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.6464155528554071, |
|
"grad_norm": 32.721920013427734, |
|
"learning_rate": 8.652870483177049e-05, |
|
"loss": 4.5142, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.6528959092750102, |
|
"eval_loss": 4.501616954803467, |
|
"eval_runtime": 13.1244, |
|
"eval_samples_per_second": 38.097, |
|
"eval_steps_per_second": 4.8, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.6577561765897124, |
|
"grad_norm": 18.631881713867188, |
|
"learning_rate": 8.162350843104291e-05, |
|
"loss": 4.4781, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.6690968003240179, |
|
"grad_norm": 9.993534088134766, |
|
"learning_rate": 7.680894386773072e-05, |
|
"loss": 4.4717, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.6739570676387201, |
|
"eval_loss": 4.4421563148498535, |
|
"eval_runtime": 92.2744, |
|
"eval_samples_per_second": 5.419, |
|
"eval_steps_per_second": 0.683, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.6804374240583232, |
|
"grad_norm": 54.20478820800781, |
|
"learning_rate": 7.209139276251558e-05, |
|
"loss": 4.4965, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6917780477926286, |
|
"grad_norm": 8.450617790222168, |
|
"learning_rate": 6.747710814644589e-05, |
|
"loss": 4.5539, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.6950182260024301, |
|
"eval_loss": 4.517686367034912, |
|
"eval_runtime": 92.691, |
|
"eval_samples_per_second": 5.394, |
|
"eval_steps_per_second": 0.68, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.703118671526934, |
|
"grad_norm": 15.6026029586792, |
|
"learning_rate": 6.297220617265435e-05, |
|
"loss": 4.4799, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.7144592952612394, |
|
"grad_norm": 43.13425064086914, |
|
"learning_rate": 5.858265800950438e-05, |
|
"loss": 4.5129, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.7160793843661402, |
|
"eval_loss": 4.539721488952637, |
|
"eval_runtime": 13.1249, |
|
"eval_samples_per_second": 38.095, |
|
"eval_steps_per_second": 4.8, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.7257999189955447, |
|
"grad_norm": 23.415088653564453, |
|
"learning_rate": 5.4314281925911634e-05, |
|
"loss": 4.4859, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.7371405427298502, |
|
"grad_norm": 25.57401466369629, |
|
"learning_rate": 5.0172735579330526e-05, |
|
"loss": 4.4162, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.7371405427298502, |
|
"eval_loss": 4.4049577713012695, |
|
"eval_runtime": 83.3117, |
|
"eval_samples_per_second": 6.002, |
|
"eval_steps_per_second": 0.756, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.7484811664641555, |
|
"grad_norm": 18.13247299194336, |
|
"learning_rate": 4.616350851662895e-05, |
|
"loss": 4.4328, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.7582017010935601, |
|
"eval_loss": 4.424379348754883, |
|
"eval_runtime": 78.9244, |
|
"eval_samples_per_second": 6.335, |
|
"eval_steps_per_second": 0.798, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.759821790198461, |
|
"grad_norm": 31.204004287719727, |
|
"learning_rate": 4.229191489779047e-05, |
|
"loss": 4.3971, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.7711624139327663, |
|
"grad_norm": 34.741661071777344, |
|
"learning_rate": 3.8563086452088506e-05, |
|
"loss": 4.3949, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.7792628594572701, |
|
"eval_loss": 4.401327133178711, |
|
"eval_runtime": 13.1531, |
|
"eval_samples_per_second": 38.014, |
|
"eval_steps_per_second": 4.79, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.7825030376670717, |
|
"grad_norm": 30.405717849731445, |
|
"learning_rate": 3.498196567606959e-05, |
|
"loss": 4.3752, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.7938436614013771, |
|
"grad_norm": 27.06761932373047, |
|
"learning_rate": 3.1553299282360966e-05, |
|
"loss": 4.3946, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8003240178209802, |
|
"eval_loss": 4.372068405151367, |
|
"eval_runtime": 95.0407, |
|
"eval_samples_per_second": 5.261, |
|
"eval_steps_per_second": 0.663, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.8051842851356824, |
|
"grad_norm": 11.737322807312012, |
|
"learning_rate": 2.828163190798644e-05, |
|
"loss": 4.3378, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.8165249088699879, |
|
"grad_norm": 27.00642204284668, |
|
"learning_rate": 2.5171300090530106e-05, |
|
"loss": 4.393, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.8213851761846902, |
|
"eval_loss": 4.358621120452881, |
|
"eval_runtime": 92.1383, |
|
"eval_samples_per_second": 5.427, |
|
"eval_steps_per_second": 0.684, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.8278655326042932, |
|
"grad_norm": 22.39855194091797, |
|
"learning_rate": 2.2226426520131734e-05, |
|
"loss": 4.3436, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.8392061563385986, |
|
"grad_norm": 15.117547988891602, |
|
"learning_rate": 1.9450914574933725e-05, |
|
"loss": 4.3872, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.8424463345484001, |
|
"eval_loss": 4.355201244354248, |
|
"eval_runtime": 13.114, |
|
"eval_samples_per_second": 38.127, |
|
"eval_steps_per_second": 4.804, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.850546780072904, |
|
"grad_norm": 22.19342041015625, |
|
"learning_rate": 1.6848443147221828e-05, |
|
"loss": 4.3434, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.8618874038072094, |
|
"grad_norm": 21.237140655517578, |
|
"learning_rate": 1.4422461767118233e-05, |
|
"loss": 4.3787, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.8635074929121102, |
|
"eval_loss": 4.404301166534424, |
|
"eval_runtime": 89.2424, |
|
"eval_samples_per_second": 5.603, |
|
"eval_steps_per_second": 0.706, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.8732280275415147, |
|
"grad_norm": 11.590981483459473, |
|
"learning_rate": 1.2176186030289936e-05, |
|
"loss": 4.3491, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.8845686512758202, |
|
"grad_norm": 7.868985652923584, |
|
"learning_rate": 1.011259333573326e-05, |
|
"loss": 4.3477, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.8845686512758202, |
|
"eval_loss": 4.3536529541015625, |
|
"eval_runtime": 77.8001, |
|
"eval_samples_per_second": 6.427, |
|
"eval_steps_per_second": 0.81, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.8959092750101255, |
|
"grad_norm": 22.794200897216797, |
|
"learning_rate": 8.234418939283866e-06, |
|
"loss": 4.3957, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.9056298096395302, |
|
"eval_loss": 4.333388805389404, |
|
"eval_runtime": 13.1267, |
|
"eval_samples_per_second": 38.09, |
|
"eval_steps_per_second": 4.799, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.907249898744431, |
|
"grad_norm": 11.550942420959473, |
|
"learning_rate": 6.544152328083152e-06, |
|
"loss": 4.3827, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.9185905224787363, |
|
"grad_norm": 11.666648864746094, |
|
"learning_rate": 5.044033920806933e-06, |
|
"loss": 4.3634, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.9266909680032401, |
|
"eval_loss": 4.30883264541626, |
|
"eval_runtime": 89.3126, |
|
"eval_samples_per_second": 5.598, |
|
"eval_steps_per_second": 0.705, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.9299311462130417, |
|
"grad_norm": 13.062474250793457, |
|
"learning_rate": 3.7360520980297514e-06, |
|
"loss": 4.3485, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.9412717699473471, |
|
"grad_norm": 10.294334411621094, |
|
"learning_rate": 2.6219405666614402e-06, |
|
"loss": 4.2898, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.9477521263669502, |
|
"eval_loss": 4.334242820739746, |
|
"eval_runtime": 92.3717, |
|
"eval_samples_per_second": 5.413, |
|
"eval_steps_per_second": 0.682, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.9526123936816525, |
|
"grad_norm": 8.134040832519531, |
|
"learning_rate": 1.7031760619491353e-06, |
|
"loss": 4.3431, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.9639530174159578, |
|
"grad_norm": 9.691034317016602, |
|
"learning_rate": 9.809763900905875e-07, |
|
"loss": 4.355, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.9688132847306602, |
|
"eval_loss": 4.32021951675415, |
|
"eval_runtime": 13.1964, |
|
"eval_samples_per_second": 37.889, |
|
"eval_steps_per_second": 4.774, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.9752936411502633, |
|
"grad_norm": 14.163640975952148, |
|
"learning_rate": 4.562988140535073e-07, |
|
"loss": 4.327, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.9866342648845686, |
|
"grad_norm": 8.559078216552734, |
|
"learning_rate": 1.298387847403437e-07, |
|
"loss": 4.3331, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.9898744430943702, |
|
"eval_loss": 4.323936462402344, |
|
"eval_runtime": 86.9395, |
|
"eval_samples_per_second": 5.751, |
|
"eval_steps_per_second": 0.725, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.9979748886188741, |
|
"grad_norm": 6.521531105041504, |
|
"learning_rate": 2.029019180288527e-09, |
|
"loss": 4.3108, |
|
"step": 616 |
|
} |
|
], |
|
"logging_steps": 7, |
|
"max_steps": 617, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 13, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.001836583160381e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|