{
  "best_metric": 2.2199668884277344,
  "best_model_checkpoint": "/home/datta0/models/lora_final/Meta-Llama-3-8B_pct_ortho/checkpoint-328",
  "epoch": 0.9990344383649823,
  "eval_steps": 8,
  "global_step": 388,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002574831026713872,
      "grad_norm": 4.8250555992126465,
      "learning_rate": 3.75e-05,
      "loss": 2.3613,
      "step": 1
    },
    {
      "epoch": 0.010299324106855488,
      "grad_norm": 3.812638998031616,
      "learning_rate": 0.00015,
      "loss": 2.3464,
      "step": 4
    },
    {
      "epoch": 0.020598648213710977,
      "grad_norm": 4.135324478149414,
      "learning_rate": 0.0003,
      "loss": 2.2594,
      "step": 8
    },
    {
      "epoch": 0.020598648213710977,
      "eval_loss": 2.2432966232299805,
      "eval_runtime": 10.4685,
      "eval_samples_per_second": 23.404,
      "eval_steps_per_second": 2.961,
      "step": 8
    },
    {
      "epoch": 0.03089797232056646,
      "grad_norm": 3.940943956375122,
      "learning_rate": 0.0002999179886011389,
      "loss": 2.2353,
      "step": 12
    },
    {
      "epoch": 0.04119729642742195,
      "grad_norm": 3.0164599418640137,
      "learning_rate": 0.00029967204408281613,
      "loss": 2.2616,
      "step": 16
    },
    {
      "epoch": 0.04119729642742195,
      "eval_loss": 2.2376890182495117,
      "eval_runtime": 10.472,
      "eval_samples_per_second": 23.396,
      "eval_steps_per_second": 2.96,
      "step": 16
    },
    {
      "epoch": 0.051496620534277435,
      "grad_norm": 3.438307046890259,
      "learning_rate": 0.0002992624353817517,
      "loss": 2.2166,
      "step": 20
    },
    {
      "epoch": 0.06179594464113292,
      "grad_norm": 3.0681569576263428,
      "learning_rate": 0.00029868961039904624,
      "loss": 2.2041,
      "step": 24
    },
    {
      "epoch": 0.06179594464113292,
      "eval_loss": 2.2342782020568848,
      "eval_runtime": 10.4748,
      "eval_samples_per_second": 23.389,
      "eval_steps_per_second": 2.959,
      "step": 24
    },
    {
      "epoch": 0.07209526874798841,
      "grad_norm": 2.55240535736084,
      "learning_rate": 0.00029795419551040833,
      "loss": 2.2296,
      "step": 28
    },
    {
      "epoch": 0.0823945928548439,
      "grad_norm": 2.451361894607544,
      "learning_rate": 0.0002970569948812214,
      "loss": 2.2802,
      "step": 32
    },
    {
      "epoch": 0.0823945928548439,
      "eval_loss": 2.253261089324951,
      "eval_runtime": 10.4553,
      "eval_samples_per_second": 23.433,
      "eval_steps_per_second": 2.965,
      "step": 32
    },
    {
      "epoch": 0.09269391696169939,
      "grad_norm": 5.104744911193848,
      "learning_rate": 0.0002959989895872009,
      "loss": 2.2541,
      "step": 36
    },
    {
      "epoch": 0.10299324106855487,
      "grad_norm": 2.3457939624786377,
      "learning_rate": 0.0002947813365416023,
      "loss": 2.2832,
      "step": 40
    },
    {
      "epoch": 0.10299324106855487,
      "eval_loss": 2.2618255615234375,
      "eval_runtime": 10.4563,
      "eval_samples_per_second": 23.431,
      "eval_steps_per_second": 2.965,
      "step": 40
    },
    {
      "epoch": 0.11329256517541036,
      "grad_norm": 3.244114398956299,
      "learning_rate": 0.0002934053672301536,
      "loss": 2.2859,
      "step": 44
    },
    {
      "epoch": 0.12359188928226585,
      "grad_norm": 2.5838561058044434,
      "learning_rate": 0.00029187258625509513,
      "loss": 2.2484,
      "step": 48
    },
    {
      "epoch": 0.12359188928226585,
      "eval_loss": 2.25663685798645,
      "eval_runtime": 10.4533,
      "eval_samples_per_second": 23.438,
      "eval_steps_per_second": 2.966,
      "step": 48
    },
    {
      "epoch": 0.13389121338912133,
      "grad_norm": 2.609206199645996,
      "learning_rate": 0.0002901846696899191,
      "loss": 2.3116,
      "step": 52
    },
    {
      "epoch": 0.14419053749597682,
      "grad_norm": 3.638091802597046,
      "learning_rate": 0.0002883434632466077,
      "loss": 2.2735,
      "step": 56
    },
    {
      "epoch": 0.14419053749597682,
      "eval_loss": 2.2576816082000732,
      "eval_runtime": 10.4026,
      "eval_samples_per_second": 23.552,
      "eval_steps_per_second": 2.98,
      "step": 56
    },
    {
      "epoch": 0.15448986160283232,
      "grad_norm": 2.5911107063293457,
      "learning_rate": 0.00028635098025737434,
      "loss": 2.2803,
      "step": 60
    },
    {
      "epoch": 0.1647891857096878,
      "grad_norm": 2.4840705394744873,
      "learning_rate": 0.0002842093994731145,
      "loss": 2.293,
      "step": 64
    },
    {
      "epoch": 0.1647891857096878,
      "eval_loss": 2.2783665657043457,
      "eval_runtime": 10.3852,
      "eval_samples_per_second": 23.591,
      "eval_steps_per_second": 2.985,
      "step": 64
    },
    {
      "epoch": 0.17508850981654328,
      "grad_norm": 2.24977970123291,
      "learning_rate": 0.00028192106268097334,
      "loss": 2.3169,
      "step": 68
    },
    {
      "epoch": 0.18538783392339878,
      "grad_norm": 2.666362762451172,
      "learning_rate": 0.0002794884721436361,
      "loss": 2.2518,
      "step": 72
    },
    {
      "epoch": 0.18538783392339878,
      "eval_loss": 2.2818338871002197,
      "eval_runtime": 10.3527,
      "eval_samples_per_second": 23.665,
      "eval_steps_per_second": 2.994,
      "step": 72
    },
    {
      "epoch": 0.19568715803025427,
      "grad_norm": 2.480801582336426,
      "learning_rate": 0.0002769142878631403,
      "loss": 2.3334,
      "step": 76
    },
    {
      "epoch": 0.20598648213710974,
      "grad_norm": 2.343674898147583,
      "learning_rate": 0.000274201324672203,
      "loss": 2.2922,
      "step": 80
    },
    {
      "epoch": 0.20598648213710974,
      "eval_loss": 2.291419744491577,
      "eval_runtime": 10.3695,
      "eval_samples_per_second": 23.627,
      "eval_steps_per_second": 2.99,
      "step": 80
    },
    {
      "epoch": 0.21628580624396523,
      "grad_norm": 2.1927764415740967,
      "learning_rate": 0.0002713525491562421,
      "loss": 2.2794,
      "step": 84
    },
    {
      "epoch": 0.22658513035082073,
      "grad_norm": 2.764920711517334,
      "learning_rate": 0.00026837107640945905,
      "loss": 2.3199,
      "step": 88
    },
    {
      "epoch": 0.22658513035082073,
      "eval_loss": 2.2720041275024414,
      "eval_runtime": 10.3284,
      "eval_samples_per_second": 23.721,
      "eval_steps_per_second": 3.001,
      "step": 88
    },
    {
      "epoch": 0.23688445445767622,
      "grad_norm": 2.5399646759033203,
      "learning_rate": 0.00026526016662852886,
      "loss": 2.2966,
      "step": 92
    },
    {
      "epoch": 0.2471837785645317,
      "grad_norm": 2.4544336795806885,
      "learning_rate": 0.0002620232215476231,
      "loss": 2.3807,
      "step": 96
    },
    {
      "epoch": 0.2471837785645317,
      "eval_loss": 2.278792142868042,
      "eval_runtime": 10.3488,
      "eval_samples_per_second": 23.674,
      "eval_steps_per_second": 2.996,
      "step": 96
    },
    {
      "epoch": 0.2574831026713872,
      "grad_norm": 2.6318697929382324,
      "learning_rate": 0.00025866378071866334,
      "loss": 2.3591,
      "step": 100
    },
    {
      "epoch": 0.26778242677824265,
      "grad_norm": 2.789501667022705,
      "learning_rate": 0.00025518551764087326,
      "loss": 2.3528,
      "step": 104
    },
    {
      "epoch": 0.26778242677824265,
      "eval_loss": 2.2830569744110107,
      "eval_runtime": 10.2996,
      "eval_samples_per_second": 23.787,
      "eval_steps_per_second": 3.01,
      "step": 104
    },
    {
      "epoch": 0.2780817508850982,
      "grad_norm": 2.526961326599121,
      "learning_rate": 0.00025159223574386114,
      "loss": 2.3028,
      "step": 108
    },
    {
      "epoch": 0.28838107499195365,
      "grad_norm": 3.0287418365478516,
      "learning_rate": 0.00024788786422862526,
      "loss": 2.3144,
      "step": 112
    },
    {
      "epoch": 0.28838107499195365,
      "eval_loss": 2.295750379562378,
      "eval_runtime": 89.6533,
      "eval_samples_per_second": 2.733,
      "eval_steps_per_second": 0.346,
      "step": 112
    },
    {
      "epoch": 0.29868039909880917,
      "grad_norm": 3.0724334716796875,
      "learning_rate": 0.00024407645377103054,
      "loss": 2.3251,
      "step": 116
    },
    {
      "epoch": 0.30897972320566464,
      "grad_norm": 3.3411991596221924,
      "learning_rate": 0.00024016217209245374,
      "loss": 2.3652,
      "step": 120
    },
    {
      "epoch": 0.30897972320566464,
      "eval_loss": 2.295003652572632,
      "eval_runtime": 92.6572,
      "eval_samples_per_second": 2.644,
      "eval_steps_per_second": 0.335,
      "step": 120
    },
    {
      "epoch": 0.3192790473125201,
      "grad_norm": 2.637554407119751,
      "learning_rate": 0.0002361492994024415,
      "loss": 2.2994,
      "step": 124
    },
    {
      "epoch": 0.3295783714193756,
      "grad_norm": 2.461324453353882,
      "learning_rate": 0.00023204222371836405,
      "loss": 2.3637,
      "step": 128
    },
    {
      "epoch": 0.3295783714193756,
      "eval_loss": 2.2845101356506348,
      "eval_runtime": 94.8437,
      "eval_samples_per_second": 2.583,
      "eval_steps_per_second": 0.327,
      "step": 128
    },
    {
      "epoch": 0.3398776955262311,
      "grad_norm": 2.624124765396118,
      "learning_rate": 0.00022784543606718227,
      "loss": 2.332,
      "step": 132
    },
    {
      "epoch": 0.35017701963308656,
      "grad_norm": 2.3923728466033936,
      "learning_rate": 0.0002235635255745762,
      "loss": 2.3014,
      "step": 136
    },
    {
      "epoch": 0.35017701963308656,
      "eval_loss": 2.278064250946045,
      "eval_runtime": 101.7542,
      "eval_samples_per_second": 2.408,
      "eval_steps_per_second": 0.305,
      "step": 136
    },
    {
      "epoch": 0.3604763437399421,
      "grad_norm": 3.1025068759918213,
      "learning_rate": 0.00021920117444680317,
      "loss": 2.3819,
      "step": 140
    },
    {
      "epoch": 0.37077566784679755,
      "grad_norm": 2.671719551086426,
      "learning_rate": 0.0002147631528507739,
      "loss": 2.3067,
      "step": 144
    },
    {
      "epoch": 0.37077566784679755,
      "eval_loss": 2.2829501628875732,
      "eval_runtime": 101.7933,
      "eval_samples_per_second": 2.407,
      "eval_steps_per_second": 0.305,
      "step": 144
    },
    {
      "epoch": 0.381074991953653,
      "grad_norm": 2.460653781890869,
      "learning_rate": 0.0002102543136979454,
      "loss": 2.2942,
      "step": 148
    },
    {
      "epoch": 0.39137431606050854,
      "grad_norm": 2.579962730407715,
      "learning_rate": 0.0002056795873377331,
      "loss": 2.3242,
      "step": 152
    },
    {
      "epoch": 0.39137431606050854,
      "eval_loss": 2.2787716388702393,
      "eval_runtime": 105.0919,
      "eval_samples_per_second": 2.331,
      "eval_steps_per_second": 0.295,
      "step": 152
    },
    {
      "epoch": 0.401673640167364,
      "grad_norm": 2.271052837371826,
      "learning_rate": 0.00020104397616624645,
      "loss": 2.3009,
      "step": 156
    },
    {
      "epoch": 0.4119729642742195,
      "grad_norm": 3.09191632270813,
      "learning_rate": 0.0001963525491562421,
      "loss": 2.3184,
      "step": 160
    },
    {
      "epoch": 0.4119729642742195,
      "eval_loss": 2.265949249267578,
      "eval_runtime": 100.9124,
      "eval_samples_per_second": 2.428,
      "eval_steps_per_second": 0.307,
      "step": 160
    },
    {
      "epoch": 0.422272288381075,
      "grad_norm": 2.4084908962249756,
      "learning_rate": 0.00019161043631427666,
      "loss": 2.2862,
      "step": 164
    },
    {
      "epoch": 0.43257161248793047,
      "grad_norm": 2.453981399536133,
      "learning_rate": 0.00018682282307111987,
      "loss": 2.3574,
      "step": 168
    },
    {
      "epoch": 0.43257161248793047,
      "eval_loss": 2.278163433074951,
      "eval_runtime": 10.4957,
      "eval_samples_per_second": 23.343,
      "eval_steps_per_second": 2.954,
      "step": 168
    },
    {
      "epoch": 0.442870936594786,
      "grad_norm": 2.5125510692596436,
      "learning_rate": 0.00018199494461156203,
      "loss": 2.3157,
      "step": 172
    },
    {
      "epoch": 0.45317026070164146,
      "grad_norm": 3.002711057662964,
      "learning_rate": 0.00017713208014981648,
      "loss": 2.3006,
      "step": 176
    },
    {
      "epoch": 0.45317026070164146,
      "eval_loss": 2.2733473777770996,
      "eval_runtime": 10.4821,
      "eval_samples_per_second": 23.373,
      "eval_steps_per_second": 2.957,
      "step": 176
    },
    {
      "epoch": 0.4634695848084969,
      "grad_norm": 2.4078829288482666,
      "learning_rate": 0.00017223954715677627,
      "loss": 2.3307,
      "step": 180
    },
    {
      "epoch": 0.47376890891535245,
      "grad_norm": 2.81722092628479,
      "learning_rate": 0.00016732269554543794,
      "loss": 2.3082,
      "step": 184
    },
    {
      "epoch": 0.47376890891535245,
      "eval_loss": 2.2698850631713867,
      "eval_runtime": 10.4759,
      "eval_samples_per_second": 23.387,
      "eval_steps_per_second": 2.959,
      "step": 184
    },
    {
      "epoch": 0.4840682330222079,
      "grad_norm": 2.851599931716919,
      "learning_rate": 0.00016238690182084986,
      "loss": 2.2678,
      "step": 188
    },
    {
      "epoch": 0.4943675571290634,
      "grad_norm": 3.283416271209717,
      "learning_rate": 0.00015743756320098332,
      "loss": 2.3097,
      "step": 192
    },
    {
      "epoch": 0.4943675571290634,
      "eval_loss": 2.2615249156951904,
      "eval_runtime": 10.4874,
      "eval_samples_per_second": 23.361,
      "eval_steps_per_second": 2.956,
      "step": 192
    },
    {
      "epoch": 0.5046668812359189,
      "grad_norm": 2.369040012359619,
      "learning_rate": 0.00015248009171495378,
      "loss": 2.304,
      "step": 196
    },
    {
      "epoch": 0.5149662053427744,
      "grad_norm": 2.1018686294555664,
      "learning_rate": 0.00014751990828504622,
      "loss": 2.3003,
      "step": 200
    },
    {
      "epoch": 0.5149662053427744,
      "eval_loss": 2.264901638031006,
      "eval_runtime": 10.4653,
      "eval_samples_per_second": 23.411,
      "eval_steps_per_second": 2.962,
      "step": 200
    },
    {
      "epoch": 0.5252655294496299,
      "grad_norm": 2.4146642684936523,
      "learning_rate": 0.00014256243679901663,
      "loss": 2.2735,
      "step": 204
    },
    {
      "epoch": 0.5355648535564853,
      "grad_norm": 2.4833011627197266,
      "learning_rate": 0.00013761309817915014,
      "loss": 2.3027,
      "step": 208
    },
    {
      "epoch": 0.5355648535564853,
      "eval_loss": 2.2593843936920166,
      "eval_runtime": 10.4677,
      "eval_samples_per_second": 23.405,
      "eval_steps_per_second": 2.961,
      "step": 208
    },
    {
      "epoch": 0.5458641776633408,
      "grad_norm": 2.3660264015197754,
      "learning_rate": 0.00013267730445456208,
      "loss": 2.2475,
      "step": 212
    },
    {
      "epoch": 0.5561635017701964,
      "grad_norm": 2.8252508640289307,
      "learning_rate": 0.00012776045284322368,
      "loss": 2.3262,
      "step": 216
    },
    {
      "epoch": 0.5561635017701964,
      "eval_loss": 2.249122142791748,
      "eval_runtime": 10.4377,
      "eval_samples_per_second": 23.473,
      "eval_steps_per_second": 2.97,
      "step": 216
    },
    {
      "epoch": 0.5664628258770518,
      "grad_norm": 2.3867785930633545,
      "learning_rate": 0.00012286791985018355,
      "loss": 2.2732,
      "step": 220
    },
    {
      "epoch": 0.5767621499839073,
      "grad_norm": 2.2431817054748535,
      "learning_rate": 0.00011800505538843798,
      "loss": 2.3118,
      "step": 224
    },
    {
      "epoch": 0.5767621499839073,
      "eval_loss": 2.2598836421966553,
      "eval_runtime": 10.3894,
      "eval_samples_per_second": 23.582,
      "eval_steps_per_second": 2.984,
      "step": 224
    },
    {
      "epoch": 0.5870614740907628,
      "grad_norm": 2.2373931407928467,
      "learning_rate": 0.00011317717692888012,
      "loss": 2.3041,
      "step": 228
    },
    {
      "epoch": 0.5973607981976183,
      "grad_norm": 2.278763771057129,
      "learning_rate": 0.00010838956368572334,
      "loss": 2.2904,
      "step": 232
    },
    {
      "epoch": 0.5973607981976183,
      "eval_loss": 2.26226544380188,
      "eval_runtime": 10.39,
      "eval_samples_per_second": 23.58,
      "eval_steps_per_second": 2.984,
      "step": 232
    },
    {
      "epoch": 0.6076601223044737,
      "grad_norm": 2.2224009037017822,
      "learning_rate": 0.0001036474508437579,
      "loss": 2.3356,
      "step": 236
    },
    {
      "epoch": 0.6179594464113293,
      "grad_norm": 1.9502872228622437,
      "learning_rate": 9.895602383375353e-05,
      "loss": 2.2519,
      "step": 240
    },
    {
      "epoch": 0.6179594464113293,
      "eval_loss": 2.249502658843994,
      "eval_runtime": 10.3323,
      "eval_samples_per_second": 23.712,
      "eval_steps_per_second": 3.0,
      "step": 240
    },
    {
      "epoch": 0.6282587705181848,
      "grad_norm": 1.9756816625595093,
      "learning_rate": 9.432041266226686e-05,
      "loss": 2.3011,
      "step": 244
    },
    {
      "epoch": 0.6385580946250402,
      "grad_norm": 2.24442195892334,
      "learning_rate": 8.97456863020546e-05,
      "loss": 2.2907,
      "step": 248
    },
    {
      "epoch": 0.6385580946250402,
      "eval_loss": 2.252591848373413,
      "eval_runtime": 10.375,
      "eval_samples_per_second": 23.615,
      "eval_steps_per_second": 2.988,
      "step": 248
    },
    {
      "epoch": 0.6488574187318957,
      "grad_norm": 2.2955076694488525,
      "learning_rate": 8.523684714922608e-05,
      "loss": 2.2493,
      "step": 252
    },
    {
      "epoch": 0.6591567428387513,
      "grad_norm": 2.3158631324768066,
      "learning_rate": 8.079882555319684e-05,
      "loss": 2.2864,
      "step": 256
    },
    {
      "epoch": 0.6591567428387513,
      "eval_loss": 2.25117826461792,
      "eval_runtime": 10.3184,
      "eval_samples_per_second": 23.744,
      "eval_steps_per_second": 3.004,
      "step": 256
    },
    {
      "epoch": 0.6694560669456067,
      "grad_norm": 1.8003387451171875,
      "learning_rate": 7.643647442542382e-05,
      "loss": 2.2806,
      "step": 260
    },
    {
      "epoch": 0.6797553910524622,
      "grad_norm": 2.0942351818084717,
      "learning_rate": 7.215456393281776e-05,
      "loss": 2.242,
      "step": 264
    },
    {
      "epoch": 0.6797553910524622,
      "eval_loss": 2.249201774597168,
      "eval_runtime": 10.2986,
      "eval_samples_per_second": 23.79,
      "eval_steps_per_second": 3.01,
      "step": 264
    },
    {
      "epoch": 0.6900547151593177,
      "grad_norm": 2.113708019256592,
      "learning_rate": 6.795777628163599e-05,
      "loss": 2.2817,
      "step": 268
    },
    {
      "epoch": 0.7003540392661731,
      "grad_norm": 2.8915371894836426,
      "learning_rate": 6.385070059755846e-05,
      "loss": 2.2941,
      "step": 272
    },
    {
      "epoch": 0.7003540392661731,
      "eval_loss": 2.2414655685424805,
      "eval_runtime": 81.0572,
      "eval_samples_per_second": 3.023,
      "eval_steps_per_second": 0.382,
      "step": 272
    },
    {
      "epoch": 0.7106533633730286,
      "grad_norm": 2.2395060062408447,
      "learning_rate": 5.983782790754623e-05,
      "loss": 2.2913,
      "step": 276
    },
    {
      "epoch": 0.7209526874798842,
      "grad_norm": 2.083782911300659,
      "learning_rate": 5.592354622896944e-05,
      "loss": 2.2799,
      "step": 280
    },
    {
      "epoch": 0.7209526874798842,
      "eval_loss": 2.238328695297241,
      "eval_runtime": 47.7655,
      "eval_samples_per_second": 5.129,
      "eval_steps_per_second": 0.649,
      "step": 280
    },
    {
      "epoch": 0.7312520115867396,
      "grad_norm": 2.176422595977783,
      "learning_rate": 5.211213577137469e-05,
      "loss": 2.2807,
      "step": 284
    },
    {
      "epoch": 0.7415513356935951,
      "grad_norm": 2.0646986961364746,
      "learning_rate": 4.840776425613886e-05,
      "loss": 2.2881,
      "step": 288
    },
    {
      "epoch": 0.7415513356935951,
      "eval_loss": 2.235785484313965,
      "eval_runtime": 48.1216,
      "eval_samples_per_second": 5.091,
      "eval_steps_per_second": 0.644,
      "step": 288
    },
    {
      "epoch": 0.7518506598004506,
      "grad_norm": 1.7966443300247192,
      "learning_rate": 4.481448235912671e-05,
      "loss": 2.2754,
      "step": 292
    },
    {
      "epoch": 0.762149983907306,
      "grad_norm": 1.8745654821395874,
      "learning_rate": 4.133621928133665e-05,
      "loss": 2.2797,
      "step": 296
    },
    {
      "epoch": 0.762149983907306,
      "eval_loss": 2.2380549907684326,
      "eval_runtime": 51.7197,
      "eval_samples_per_second": 4.737,
      "eval_steps_per_second": 0.599,
      "step": 296
    },
    {
      "epoch": 0.7724493080141616,
      "grad_norm": 2.07661771774292,
      "learning_rate": 3.797677845237696e-05,
      "loss": 2.2368,
      "step": 300
    },
    {
      "epoch": 0.7827486321210171,
      "grad_norm": 2.3376801013946533,
      "learning_rate": 3.473983337147118e-05,
      "loss": 2.3197,
      "step": 304
    },
    {
      "epoch": 0.7827486321210171,
      "eval_loss": 2.2254865169525146,
      "eval_runtime": 50.4873,
      "eval_samples_per_second": 4.853,
      "eval_steps_per_second": 0.614,
      "step": 304
    },
    {
      "epoch": 0.7930479562278725,
      "grad_norm": 2.3710439205169678,
      "learning_rate": 3.162892359054098e-05,
      "loss": 2.2302,
      "step": 308
    },
    {
      "epoch": 0.803347280334728,
      "grad_norm": 2.251873731613159,
      "learning_rate": 2.8647450843757897e-05,
      "loss": 2.2507,
      "step": 312
    },
    {
      "epoch": 0.803347280334728,
      "eval_loss": 2.228350877761841,
      "eval_runtime": 84.236,
      "eval_samples_per_second": 2.908,
      "eval_steps_per_second": 0.368,
      "step": 312
    },
    {
      "epoch": 0.8136466044415835,
      "grad_norm": 2.0786876678466797,
      "learning_rate": 2.5798675327796993e-05,
      "loss": 2.2143,
      "step": 316
    },
    {
      "epoch": 0.823945928548439,
      "grad_norm": 2.0034406185150146,
      "learning_rate": 2.3085712136859668e-05,
      "loss": 2.236,
      "step": 320
    },
    {
      "epoch": 0.823945928548439,
      "eval_loss": 2.2312803268432617,
      "eval_runtime": 81.5754,
      "eval_samples_per_second": 3.003,
      "eval_steps_per_second": 0.38,
      "step": 320
    },
    {
      "epoch": 0.8342452526552945,
      "grad_norm": 1.921857476234436,
      "learning_rate": 2.0511527856363912e-05,
      "loss": 2.2961,
      "step": 324
    },
    {
      "epoch": 0.84454457676215,
      "grad_norm": 1.6842375993728638,
      "learning_rate": 1.8078937319026654e-05,
      "loss": 2.2667,
      "step": 328
    },
    {
      "epoch": 0.84454457676215,
      "eval_loss": 2.2199668884277344,
      "eval_runtime": 10.4841,
      "eval_samples_per_second": 23.369,
      "eval_steps_per_second": 2.957,
      "step": 328
    },
    {
      "epoch": 0.8548439008690055,
      "grad_norm": 1.776142954826355,
      "learning_rate": 1.579060052688548e-05,
      "loss": 2.2333,
      "step": 332
    },
    {
      "epoch": 0.8651432249758609,
      "grad_norm": 2.0090060234069824,
      "learning_rate": 1.3649019742625623e-05,
      "loss": 2.2763,
      "step": 336
    },
    {
      "epoch": 0.8651432249758609,
      "eval_loss": 2.2255115509033203,
      "eval_runtime": 10.47,
      "eval_samples_per_second": 23.4,
      "eval_steps_per_second": 2.961,
      "step": 336
    },
    {
      "epoch": 0.8754425490827165,
      "grad_norm": 2.0487890243530273,
      "learning_rate": 1.1656536753392287e-05,
      "loss": 2.2732,
      "step": 340
    },
    {
      "epoch": 0.885741873189572,
      "grad_norm": 1.6611812114715576,
      "learning_rate": 9.815330310080887e-06,
      "loss": 2.2915,
      "step": 344
    },
    {
      "epoch": 0.885741873189572,
      "eval_loss": 2.2228875160217285,
      "eval_runtime": 10.4667,
      "eval_samples_per_second": 23.407,
      "eval_steps_per_second": 2.962,
      "step": 344
    },
    {
      "epoch": 0.8960411972964274,
      "grad_norm": 1.7494527101516724,
      "learning_rate": 8.127413744904804e-06,
      "loss": 2.2629,
      "step": 348
    },
    {
      "epoch": 0.9063405214032829,
      "grad_norm": 2.0750112533569336,
      "learning_rate": 6.594632769846353e-06,
      "loss": 2.2554,
      "step": 352
    },
    {
      "epoch": 0.9063405214032829,
      "eval_loss": 2.221097230911255,
      "eval_runtime": 10.4622,
      "eval_samples_per_second": 23.418,
      "eval_steps_per_second": 2.963,
      "step": 352
    },
    {
      "epoch": 0.9166398455101384,
      "grad_norm": 1.803432583808899,
      "learning_rate": 5.218663458397715e-06,
      "loss": 2.2508,
      "step": 356
    },
    {
      "epoch": 0.9269391696169939,
      "grad_norm": 1.7050764560699463,
      "learning_rate": 4.001010412799138e-06,
      "loss": 2.2237,
      "step": 360
    },
    {
      "epoch": 0.9269391696169939,
      "eval_loss": 2.224059820175171,
      "eval_runtime": 10.5244,
      "eval_samples_per_second": 23.279,
      "eval_steps_per_second": 2.946,
      "step": 360
    },
    {
      "epoch": 0.9372384937238494,
      "grad_norm": 1.768523097038269,
      "learning_rate": 2.9430051187785962e-06,
      "loss": 2.2772,
      "step": 364
    },
    {
      "epoch": 0.9475378178307049,
      "grad_norm": 1.7917652130126953,
      "learning_rate": 2.0458044895916513e-06,
      "loss": 2.2446,
      "step": 368
    },
    {
      "epoch": 0.9475378178307049,
      "eval_loss": 2.2229297161102295,
      "eval_runtime": 10.4777,
      "eval_samples_per_second": 23.383,
      "eval_steps_per_second": 2.959,
      "step": 368
    },
    {
      "epoch": 0.9578371419375603,
      "grad_norm": 1.6179791688919067,
      "learning_rate": 1.3103896009537207e-06,
      "loss": 2.2329,
      "step": 372
    },
    {
      "epoch": 0.9681364660444158,
      "grad_norm": 1.8671883344650269,
      "learning_rate": 7.375646182482875e-07,
      "loss": 2.2926,
      "step": 376
    },
    {
      "epoch": 0.9681364660444158,
      "eval_loss": 2.2224321365356445,
      "eval_runtime": 10.4643,
      "eval_samples_per_second": 23.413,
      "eval_steps_per_second": 2.962,
      "step": 376
    },
    {
      "epoch": 0.9784357901512714,
      "grad_norm": 2.0502476692199707,
      "learning_rate": 3.2795591718381975e-07,
      "loss": 2.2313,
      "step": 380
    },
    {
      "epoch": 0.9887351142581268,
      "grad_norm": 1.8618476390838623,
      "learning_rate": 8.201139886109264e-08,
      "loss": 2.2813,
      "step": 384
    },
    {
      "epoch": 0.9887351142581268,
      "eval_loss": 2.2218899726867676,
      "eval_runtime": 10.462,
      "eval_samples_per_second": 23.418,
      "eval_steps_per_second": 2.963,
      "step": 384
    },
    {
      "epoch": 0.9990344383649823,
      "grad_norm": 1.9400445222854614,
      "learning_rate": 0.0,
      "loss": 2.2718,
      "step": 388
    }
  ],
  "logging_steps": 4,
  "max_steps": 388,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 8,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.087198145554678e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}