{
  "best_metric": 2.2854506969451904,
  "best_model_checkpoint": "/home/datta0/models/lora_final/Mistral-7B-v0.3_pct_ortho/checkpoint-8",
  "epoch": 0.9980657640232108,
  "eval_steps": 8,
  "global_step": 387,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0025789813023855577,
      "grad_norm": 10.031487464904785,
      "learning_rate": 3.75e-05,
      "loss": 2.128,
      "step": 1
    },
    {
      "epoch": 0.010315925209542231,
      "grad_norm": 7.364378929138184,
      "learning_rate": 0.00015,
      "loss": 2.0612,
      "step": 4
    },
    {
      "epoch": 0.020631850419084462,
      "grad_norm": 23.759828567504883,
      "learning_rate": 0.0003,
      "loss": 2.1243,
      "step": 8
    },
    {
      "epoch": 0.020631850419084462,
      "eval_loss": 2.2854506969451904,
      "eval_runtime": 11.1093,
      "eval_samples_per_second": 22.054,
      "eval_steps_per_second": 2.79,
      "step": 8
    },
    {
      "epoch": 0.030947775628626693,
      "grad_norm": 133.17942810058594,
      "learning_rate": 0.00029991755529206284,
      "loss": 2.465,
      "step": 12
    },
    {
      "epoch": 0.041263700838168924,
      "grad_norm": 254.61338806152344,
      "learning_rate": 0.0002996703117966496,
      "loss": 9.1803,
      "step": 16
    },
    {
      "epoch": 0.041263700838168924,
      "eval_loss": 12.813275337219238,
      "eval_runtime": 10.7499,
      "eval_samples_per_second": 22.791,
      "eval_steps_per_second": 2.884,
      "step": 16
    },
    {
      "epoch": 0.05157962604771115,
      "grad_norm": 87.40137481689453,
      "learning_rate": 0.00029925854129933066,
      "loss": 12.8875,
      "step": 20
    },
    {
      "epoch": 0.061895551257253385,
      "grad_norm": 22.197757720947266,
      "learning_rate": 0.0002986826964440844,
      "loss": 9.137,
      "step": 24
    },
    {
      "epoch": 0.061895551257253385,
      "eval_loss": 8.456737518310547,
      "eval_runtime": 10.6838,
      "eval_samples_per_second": 22.932,
      "eval_steps_per_second": 2.902,
      "step": 24
    },
    {
      "epoch": 0.07221147646679561,
      "grad_norm": 11.230401992797852,
      "learning_rate": 0.00029794341023572295,
      "loss": 7.9575,
      "step": 28
    },
    {
      "epoch": 0.08252740167633785,
      "grad_norm": 19.19782066345215,
      "learning_rate": 0.0002970414953440533,
      "loss": 8.3645,
      "step": 32
    },
    {
      "epoch": 0.08252740167633785,
      "eval_loss": 8.26063346862793,
      "eval_runtime": 10.699,
      "eval_samples_per_second": 22.899,
      "eval_steps_per_second": 2.897,
      "step": 32
    },
    {
      "epoch": 0.09284332688588008,
      "grad_norm": 13.250898361206055,
      "learning_rate": 0.00029597794321054006,
      "loss": 8.0337,
      "step": 36
    },
    {
      "epoch": 0.1031592520954223,
      "grad_norm": 15.708863258361816,
      "learning_rate": 0.00029475392295845,
      "loss": 8.5251,
      "step": 40
    },
    {
      "epoch": 0.1031592520954223,
      "eval_loss": 7.756398677825928,
      "eval_runtime": 10.6841,
      "eval_samples_per_second": 22.931,
      "eval_steps_per_second": 2.901,
      "step": 40
    },
    {
      "epoch": 0.11347517730496454,
      "grad_norm": 8.025888442993164,
      "learning_rate": 0.0002933707801076791,
      "loss": 7.708,
      "step": 44
    },
    {
      "epoch": 0.12379110251450677,
      "grad_norm": 72.3178482055664,
      "learning_rate": 0.00029183003509567217,
      "loss": 9.4881,
      "step": 48
    },
    {
      "epoch": 0.12379110251450677,
      "eval_loss": 9.306961059570312,
      "eval_runtime": 10.6939,
      "eval_samples_per_second": 22.91,
      "eval_steps_per_second": 2.899,
      "step": 48
    },
    {
      "epoch": 0.134107027724049,
      "grad_norm": 13.741963386535645,
      "learning_rate": 0.000290133381606063,
      "loss": 8.36,
      "step": 52
    },
    {
      "epoch": 0.14442295293359123,
      "grad_norm": 18.59068489074707,
      "learning_rate": 0.0002882826847068703,
      "loss": 7.7111,
      "step": 56
    },
    {
      "epoch": 0.14442295293359123,
      "eval_loss": 7.73629903793335,
      "eval_runtime": 10.7184,
      "eval_samples_per_second": 22.858,
      "eval_steps_per_second": 2.892,
      "step": 56
    },
    {
      "epoch": 0.15473887814313347,
      "grad_norm": 20.837697982788086,
      "learning_rate": 0.00028627997880029875,
      "loss": 7.6612,
      "step": 60
    },
    {
      "epoch": 0.1650548033526757,
      "grad_norm": 8.735236167907715,
      "learning_rate": 0.0002841274653863955,
      "loss": 7.6126,
      "step": 64
    },
    {
      "epoch": 0.1650548033526757,
      "eval_loss": 7.594820022583008,
      "eval_runtime": 10.664,
      "eval_samples_per_second": 22.974,
      "eval_steps_per_second": 2.907,
      "step": 64
    },
    {
      "epoch": 0.17537072856221791,
      "grad_norm": 12.856707572937012,
      "learning_rate": 0.00028182751064302397,
      "loss": 7.5332,
      "step": 68
    },
    {
      "epoch": 0.18568665377176016,
      "grad_norm": 13.578782081604004,
      "learning_rate": 0.0002793826428248118,
      "loss": 7.6789,
      "step": 72
    },
    {
      "epoch": 0.18568665377176016,
      "eval_loss": 7.6147379875183105,
      "eval_runtime": 10.6601,
      "eval_samples_per_second": 22.983,
      "eval_steps_per_second": 2.908,
      "step": 72
    },
    {
      "epoch": 0.19600257898130238,
      "grad_norm": 7.203093528747559,
      "learning_rate": 0.0002767955494839353,
      "loss": 7.551,
      "step": 76
    },
    {
      "epoch": 0.2063185041908446,
      "grad_norm": 5.764775276184082,
      "learning_rate": 0.00027406907451579294,
      "loss": 7.7404,
      "step": 80
    },
    {
      "epoch": 0.2063185041908446,
      "eval_loss": 7.632232666015625,
      "eval_runtime": 10.6044,
      "eval_samples_per_second": 23.104,
      "eval_steps_per_second": 2.923,
      "step": 80
    },
    {
      "epoch": 0.21663442940038685,
      "grad_norm": 15.057692527770996,
      "learning_rate": 0.0002712062150328175,
      "loss": 7.7502,
      "step": 84
    },
    {
      "epoch": 0.22695035460992907,
      "grad_norm": 7.761707305908203,
      "learning_rate": 0.0002682101180698615,
      "loss": 7.7173,
      "step": 88
    },
    {
      "epoch": 0.22695035460992907,
      "eval_loss": 7.6740193367004395,
      "eval_runtime": 10.575,
      "eval_samples_per_second": 23.168,
      "eval_steps_per_second": 2.931,
      "step": 88
    },
    {
      "epoch": 0.23726627981947132,
      "grad_norm": 10.52561092376709,
      "learning_rate": 0.000265084077124779,
      "loss": 7.593,
      "step": 92
    },
    {
      "epoch": 0.24758220502901354,
      "grad_norm": 13.96662425994873,
      "learning_rate": 0.0002618315285380063,
      "loss": 7.7113,
      "step": 96
    },
    {
      "epoch": 0.24758220502901354,
      "eval_loss": 7.674198627471924,
      "eval_runtime": 10.533,
      "eval_samples_per_second": 23.26,
      "eval_steps_per_second": 2.943,
      "step": 96
    },
    {
      "epoch": 0.2578981302385558,
      "grad_norm": 5.200709819793701,
      "learning_rate": 0.00025845604771512044,
      "loss": 7.698,
      "step": 100
    },
    {
      "epoch": 0.268214055448098,
      "grad_norm": 5.996535301208496,
      "learning_rate": 0.00025496134519652946,
      "loss": 7.6961,
      "step": 104
    },
    {
      "epoch": 0.268214055448098,
      "eval_loss": 7.642160415649414,
      "eval_runtime": 10.4668,
      "eval_samples_per_second": 23.407,
      "eval_steps_per_second": 2.962,
      "step": 104
    },
    {
      "epoch": 0.27852998065764023,
      "grad_norm": 5.231250286102295,
      "learning_rate": 0.00025135126257861296,
      "loss": 7.7243,
      "step": 108
    },
    {
      "epoch": 0.28884590586718245,
      "grad_norm": 4.705758094787598,
      "learning_rate": 0.00024762976829079836,
      "loss": 7.6729,
      "step": 112
    },
    {
      "epoch": 0.28884590586718245,
      "eval_loss": 7.607565402984619,
      "eval_runtime": 96.7081,
      "eval_samples_per_second": 2.533,
      "eval_steps_per_second": 0.321,
      "step": 112
    },
    {
      "epoch": 0.29916183107672467,
      "grad_norm": 5.811936855316162,
      "learning_rate": 0.00024380095323321433,
      "loss": 7.6601,
      "step": 116
    },
    {
      "epoch": 0.30947775628626695,
      "grad_norm": 6.612433433532715,
      "learning_rate": 0.00023986902627971652,
      "loss": 7.7225,
      "step": 120
    },
    {
      "epoch": 0.30947775628626695,
      "eval_loss": 7.7170515060424805,
      "eval_runtime": 102.1676,
      "eval_samples_per_second": 2.398,
      "eval_steps_per_second": 0.303,
      "step": 120
    },
    {
      "epoch": 0.31979368149580917,
      "grad_norm": 22.143421173095703,
      "learning_rate": 0.00023583830965122902,
      "loss": 7.7705,
      "step": 124
    },
    {
      "epoch": 0.3301096067053514,
      "grad_norm": 14.632792472839355,
      "learning_rate": 0.000231713234164488,
      "loss": 7.8259,
      "step": 128
    },
    {
      "epoch": 0.3301096067053514,
      "eval_loss": 7.772371292114258,
      "eval_runtime": 92.9761,
      "eval_samples_per_second": 2.635,
      "eval_steps_per_second": 0.333,
      "step": 128
    },
    {
      "epoch": 0.3404255319148936,
      "grad_norm": 12.084867477416992,
      "learning_rate": 0.00022749833436140976,
      "loss": 7.753,
      "step": 132
    },
    {
      "epoch": 0.35074145712443583,
      "grad_norm": 9.957462310791016,
      "learning_rate": 0.00022319824352443768,
      "loss": 7.6611,
      "step": 136
    },
    {
      "epoch": 0.35074145712443583,
      "eval_loss": 7.5973920822143555,
      "eval_runtime": 108.0726,
      "eval_samples_per_second": 2.267,
      "eval_steps_per_second": 0.287,
      "step": 136
    },
    {
      "epoch": 0.3610573823339781,
      "grad_norm": 4.924723148345947,
      "learning_rate": 0.0002188176885833471,
      "loss": 7.6173,
      "step": 140
    },
    {
      "epoch": 0.3713733075435203,
      "grad_norm": 9.477831840515137,
      "learning_rate": 0.0002143614849191077,
      "loss": 7.5696,
      "step": 144
    },
    {
      "epoch": 0.3713733075435203,
      "eval_loss": 7.603182792663574,
      "eval_runtime": 91.6779,
      "eval_samples_per_second": 2.672,
      "eval_steps_per_second": 0.338,
      "step": 144
    },
    {
      "epoch": 0.38168923275306255,
      "grad_norm": 7.13740348815918,
      "learning_rate": 0.00020983453107051425,
      "loss": 7.6439,
      "step": 148
    },
    {
      "epoch": 0.39200515796260477,
      "grad_norm": 10.324462890625,
      "learning_rate": 0.00020524180334940528,
      "loss": 7.6786,
      "step": 152
    },
    {
      "epoch": 0.39200515796260477,
      "eval_loss": 7.616292476654053,
      "eval_runtime": 106.0137,
      "eval_samples_per_second": 2.311,
      "eval_steps_per_second": 0.292,
      "step": 152
    },
    {
      "epoch": 0.402321083172147,
      "grad_norm": 6.7738938331604,
      "learning_rate": 0.00020058835037038873,
      "loss": 7.6126,
      "step": 156
    },
    {
      "epoch": 0.4126370083816892,
      "grad_norm": 4.637758255004883,
      "learning_rate": 0.00019587928750108816,
      "loss": 7.4746,
      "step": 160
    },
    {
      "epoch": 0.4126370083816892,
      "eval_loss": 7.426816463470459,
      "eval_runtime": 104.9303,
      "eval_samples_per_second": 2.335,
      "eval_steps_per_second": 0.295,
      "step": 160
    },
    {
      "epoch": 0.4229529335912315,
      "grad_norm": 10.853010177612305,
      "learning_rate": 0.000191119791239009,
      "loss": 7.4631,
      "step": 164
    },
    {
      "epoch": 0.4332688588007737,
      "grad_norm": 17.25995635986328,
      "learning_rate": 0.000186315093521208,
      "loss": 7.4383,
      "step": 168
    },
    {
      "epoch": 0.4332688588007737,
      "eval_loss": 7.406857013702393,
      "eval_runtime": 10.9396,
      "eval_samples_per_second": 22.396,
      "eval_steps_per_second": 2.834,
      "step": 168
    },
    {
      "epoch": 0.4435847840103159,
      "grad_norm": 48.24932098388672,
      "learning_rate": 0.00018147047597301952,
      "loss": 7.5079,
      "step": 172
    },
    {
      "epoch": 0.45390070921985815,
      "grad_norm": 10.03960132598877,
      "learning_rate": 0.00017659126410216118,
      "loss": 7.4469,
      "step": 176
    },
    {
      "epoch": 0.45390070921985815,
      "eval_loss": 7.522530555725098,
      "eval_runtime": 11.0052,
      "eval_samples_per_second": 22.262,
      "eval_steps_per_second": 2.817,
      "step": 176
    },
    {
      "epoch": 0.46421663442940037,
      "grad_norm": 37.15389633178711,
      "learning_rate": 0.00017168282144460167,
      "loss": 7.4621,
      "step": 180
    },
    {
      "epoch": 0.47453255963894264,
      "grad_norm": 17.32857894897461,
      "learning_rate": 0.00016675054366862551,
      "loss": 7.6465,
      "step": 184
    },
    {
      "epoch": 0.47453255963894264,
      "eval_loss": 7.484851837158203,
      "eval_runtime": 11.0169,
      "eval_samples_per_second": 22.239,
      "eval_steps_per_second": 2.814,
      "step": 184
    },
    {
      "epoch": 0.48484848484848486,
      "grad_norm": 8.746517181396484,
      "learning_rate": 0.0001617998526435754,
      "loss": 7.5273,
      "step": 188
    },
    {
      "epoch": 0.4951644100580271,
      "grad_norm": 10.883841514587402,
      "learning_rate": 0.00015683619047979322,
      "loss": 7.4025,
      "step": 192
    },
    {
      "epoch": 0.4951644100580271,
      "eval_loss": 7.30989933013916,
      "eval_runtime": 10.9893,
      "eval_samples_per_second": 22.294,
      "eval_steps_per_second": 2.821,
      "step": 192
    },
    {
      "epoch": 0.5054803352675693,
      "grad_norm": 33.5476188659668,
      "learning_rate": 0.000151865013546311,
      "loss": 7.4748,
      "step": 196
    },
    {
      "epoch": 0.5157962604771116,
      "grad_norm": 9.489357948303223,
      "learning_rate": 0.00014689178647286702,
      "loss": 7.3473,
      "step": 200
    },
    {
      "epoch": 0.5157962604771116,
      "eval_loss": 7.262345314025879,
      "eval_runtime": 11.0031,
      "eval_samples_per_second": 22.267,
      "eval_steps_per_second": 2.817,
      "step": 200
    },
    {
      "epoch": 0.5261121856866537,
      "grad_norm": 17.06060791015625,
      "learning_rate": 0.00014192197614284246,
      "loss": 7.3059,
      "step": 204
    },
    {
      "epoch": 0.536428110896196,
      "grad_norm": 6.841980934143066,
      "learning_rate": 0.00013696104568371937,
      "loss": 7.2821,
      "step": 208
    },
    {
      "epoch": 0.536428110896196,
      "eval_loss": 7.248399257659912,
      "eval_runtime": 10.9335,
      "eval_samples_per_second": 22.408,
      "eval_steps_per_second": 2.835,
      "step": 208
    },
    {
      "epoch": 0.5467440361057382,
      "grad_norm": 30.315967559814453,
      "learning_rate": 0.00013201444846166842,
      "loss": 7.2134,
      "step": 212
    },
    {
      "epoch": 0.5570599613152805,
      "grad_norm": 58.75193786621094,
      "learning_rate": 0.00012708762208686638,
      "loss": 7.389,
      "step": 216
    },
    {
      "epoch": 0.5570599613152805,
      "eval_loss": 7.7176690101623535,
      "eval_runtime": 11.0125,
      "eval_samples_per_second": 22.247,
      "eval_steps_per_second": 2.815,
      "step": 216
    },
    {
      "epoch": 0.5673758865248227,
      "grad_norm": 12.3959379196167,
      "learning_rate": 0.00012218598243613358,
      "loss": 7.7131,
      "step": 220
    },
    {
      "epoch": 0.5776918117343649,
      "grad_norm": 6.3016510009765625,
      "learning_rate": 0.00011731491769946225,
      "loss": 7.2912,
      "step": 224
    },
    {
      "epoch": 0.5776918117343649,
      "eval_loss": 7.1140642166137695,
      "eval_runtime": 10.9151,
      "eval_samples_per_second": 22.446,
      "eval_steps_per_second": 2.84,
      "step": 224
    },
    {
      "epoch": 0.5880077369439072,
      "grad_norm": 12.621047019958496,
      "learning_rate": 0.00011247978245697986,
      "loss": 7.1747,
      "step": 228
    },
    {
      "epoch": 0.5983236621534493,
      "grad_norm": 11.38538646697998,
      "learning_rate": 0.00010768589179285843,
      "loss": 7.1847,
      "step": 232
    },
    {
      "epoch": 0.5983236621534493,
      "eval_loss": 7.114475250244141,
      "eval_runtime": 10.877,
      "eval_samples_per_second": 22.525,
      "eval_steps_per_second": 2.85,
      "step": 232
    },
    {
      "epoch": 0.6086395873629916,
      "grad_norm": 14.617621421813965,
      "learning_rate": 0.0001029385154526404,
      "loss": 7.1773,
      "step": 236
    },
    {
      "epoch": 0.6189555125725339,
      "grad_norm": 8.394628524780273,
      "learning_rate": 9.824287205040372e-05,
      "loss": 7.2121,
      "step": 240
    },
    {
      "epoch": 0.6189555125725339,
      "eval_loss": 7.146487236022949,
      "eval_runtime": 10.8983,
      "eval_samples_per_second": 22.481,
      "eval_steps_per_second": 2.844,
      "step": 240
    },
    {
      "epoch": 0.6292714377820761,
      "grad_norm": 9.852251052856445,
      "learning_rate": 9.360412333213324e-05,
      "loss": 7.2931,
      "step": 244
    },
    {
      "epoch": 0.6395873629916183,
      "grad_norm": 9.686322212219238,
      "learning_rate": 8.902736850160597e-05,
      "loss": 7.1216,
      "step": 248
    },
    {
      "epoch": 0.6395873629916183,
      "eval_loss": 7.147933006286621,
      "eval_runtime": 10.9167,
      "eval_samples_per_second": 22.443,
      "eval_steps_per_second": 2.84,
      "step": 248
    },
    {
      "epoch": 0.6499032882011605,
      "grad_norm": 13.205273628234863,
      "learning_rate": 8.451763861502603e-05,
      "loss": 7.4023,
      "step": 252
    },
    {
      "epoch": 0.6602192134107028,
      "grad_norm": 7.406120300292969,
      "learning_rate": 8.007989105057155e-05,
      "loss": 7.2503,
      "step": 256
    },
    {
      "epoch": 0.6602192134107028,
      "eval_loss": 7.110508441925049,
      "eval_runtime": 10.8603,
      "eval_samples_per_second": 22.559,
      "eval_steps_per_second": 2.854,
      "step": 256
    },
    {
      "epoch": 0.670535138620245,
      "grad_norm": 7.229670524597168,
      "learning_rate": 7.571900405893403e-05,
      "loss": 7.2731,
      "step": 260
    },
    {
      "epoch": 0.6808510638297872,
      "grad_norm": 13.080286979675293,
      "learning_rate": 7.143977140083847e-05,
      "loss": 7.1416,
      "step": 264
    },
    {
      "epoch": 0.6808510638297872,
      "eval_loss": 7.173031806945801,
      "eval_runtime": 10.8292,
      "eval_samples_per_second": 22.624,
      "eval_steps_per_second": 2.863,
      "step": 264
    },
    {
      "epoch": 0.6911669890393295,
      "grad_norm": 7.729543685913086,
      "learning_rate": 6.724689707744056e-05,
      "loss": 7.1927,
      "step": 268
    },
    {
      "epoch": 0.7014829142488717,
      "grad_norm": 6.9558587074279785,
      "learning_rate": 6.314499015939392e-05,
      "loss": 7.2288,
      "step": 272
    },
    {
      "epoch": 0.7014829142488717,
      "eval_loss": 7.149050235748291,
      "eval_runtime": 101.2615,
      "eval_samples_per_second": 2.419,
      "eval_steps_per_second": 0.306,
      "step": 272
    },
    {
      "epoch": 0.7117988394584139,
      "grad_norm": 37.099365234375,
      "learning_rate": 5.913855972026981e-05,
      "loss": 7.3082,
      "step": 276
    },
    {
      "epoch": 0.7221147646679562,
      "grad_norm": 20.466567993164062,
      "learning_rate": 5.52320098799004e-05,
      "loss": 7.3502,
      "step": 280
    },
    {
      "epoch": 0.7221147646679562,
      "eval_loss": 7.199110984802246,
      "eval_runtime": 101.1627,
      "eval_samples_per_second": 2.422,
      "eval_steps_per_second": 0.306,
      "step": 280
    },
    {
      "epoch": 0.7324306898774984,
      "grad_norm": 9.72053337097168,
      "learning_rate": 5.1429634963094275e-05,
      "loss": 7.1518,
      "step": 284
    },
    {
      "epoch": 0.7427466150870407,
      "grad_norm": 8.377337455749512,
      "learning_rate": 4.773561477904451e-05,
      "loss": 7.2648,
      "step": 288
    },
    {
      "epoch": 0.7427466150870407,
      "eval_loss": 7.140583038330078,
      "eval_runtime": 102.1903,
      "eval_samples_per_second": 2.397,
      "eval_steps_per_second": 0.303,
      "step": 288
    },
    {
      "epoch": 0.7530625402965828,
      "grad_norm": 11.30972671508789,
      "learning_rate": 4.415401002662016e-05,
      "loss": 7.2351,
      "step": 292
    },
    {
      "epoch": 0.7633784655061251,
      "grad_norm": 11.135313987731934,
      "learning_rate": 4.068875783059153e-05,
      "loss": 7.1647,
      "step": 296
    },
    {
      "epoch": 0.7633784655061251,
      "eval_loss": 7.122550964355469,
      "eval_runtime": 94.8476,
      "eval_samples_per_second": 2.583,
      "eval_steps_per_second": 0.327,
      "step": 296
    },
    {
      "epoch": 0.7736943907156673,
      "grad_norm": 3.4541268348693848,
      "learning_rate": 3.734366741369488e-05,
      "loss": 7.0893,
      "step": 300
    },
    {
      "epoch": 0.7840103159252095,
      "grad_norm": 9.31264591217041,
      "learning_rate": 3.4122415909296155e-05,
      "loss": 7.1678,
      "step": 304
    },
    {
      "epoch": 0.7840103159252095,
      "eval_loss": 7.084330081939697,
      "eval_runtime": 97.4225,
      "eval_samples_per_second": 2.515,
      "eval_steps_per_second": 0.318,
      "step": 304
    },
    {
      "epoch": 0.7943262411347518,
      "grad_norm": 4.439526081085205,
      "learning_rate": 3.1028544319255097e-05,
      "loss": 7.1907,
      "step": 308
    },
    {
      "epoch": 0.804642166344294,
      "grad_norm": 7.1955766677856445,
      "learning_rate": 2.8065453621433405e-05,
      "loss": 7.1879,
      "step": 312
    },
    {
      "epoch": 0.804642166344294,
      "eval_loss": 7.1045451164245605,
      "eval_runtime": 98.3747,
      "eval_samples_per_second": 2.49,
      "eval_steps_per_second": 0.315,
      "step": 312
    },
    {
      "epoch": 0.8149580915538363,
      "grad_norm": 7.633655548095703,
      "learning_rate": 2.5236401031126713e-05,
      "loss": 7.1379,
      "step": 316
    },
    {
      "epoch": 0.8252740167633784,
      "grad_norm": 6.028517246246338,
      "learning_rate": 2.2544496420529107e-05,
      "loss": 7.2384,
      "step": 320
    },
    {
      "epoch": 0.8252740167633784,
      "eval_loss": 7.113685607910156,
      "eval_runtime": 104.0901,
      "eval_samples_per_second": 2.354,
      "eval_steps_per_second": 0.298,
      "step": 320
    },
    {
      "epoch": 0.8355899419729207,
      "grad_norm": 13.628016471862793,
      "learning_rate": 1.9992698900165983e-05,
      "loss": 7.0224,
      "step": 324
    },
    {
      "epoch": 0.845905867182463,
      "grad_norm": 9.757091522216797,
      "learning_rate": 1.7583813566054454e-05,
      "loss": 7.2301,
      "step": 328
    },
    {
      "epoch": 0.845905867182463,
      "eval_loss": 7.094879150390625,
      "eval_runtime": 11.0422,
      "eval_samples_per_second": 22.188,
      "eval_steps_per_second": 2.807,
      "step": 328
    },
    {
      "epoch": 0.8562217923920051,
      "grad_norm": 12.490655899047852,
      "learning_rate": 1.5320488416165604e-05,
      "loss": 7.2313,
      "step": 332
    },
    {
      "epoch": 0.8665377176015474,
      "grad_norm": 17.20357894897461,
      "learning_rate": 1.3205211439578656e-05,
      "loss": 7.2897,
      "step": 336
    },
    {
      "epoch": 0.8665377176015474,
      "eval_loss": 7.127319812774658,
      "eval_runtime": 11.0618,
      "eval_samples_per_second": 22.148,
      "eval_steps_per_second": 2.802,
      "step": 336
    },
    {
      "epoch": 0.8768536428110896,
      "grad_norm": 9.043377876281738,
      "learning_rate": 1.1240307881527516e-05,
      "loss": 7.0591,
      "step": 340
    },
    {
      "epoch": 0.8871695680206318,
      "grad_norm": 6.038938522338867,
      "learning_rate": 9.427937687345144e-06,
      "loss": 7.1483,
      "step": 344
    },
    {
      "epoch": 0.8871695680206318,
      "eval_loss": 7.108406066894531,
      "eval_runtime": 11.0489,
      "eval_samples_per_second": 22.174,
      "eval_steps_per_second": 2.806,
      "step": 344
    },
    {
      "epoch": 0.8974854932301741,
      "grad_norm": 6.139174461364746,
      "learning_rate": 7.770093128115911e-06,
      "loss": 7.1885,
      "step": 348
    },
    {
      "epoch": 0.9078014184397163,
      "grad_norm": 9.17184829711914,
      "learning_rate": 6.268596610646382e-06,
      "loss": 7.1119,
      "step": 352
    },
    {
      "epoch": 0.9078014184397163,
      "eval_loss": 7.098378658294678,
      "eval_runtime": 11.0353,
      "eval_samples_per_second": 22.201,
      "eval_steps_per_second": 2.809,
      "step": 352
    },
    {
      "epoch": 0.9181173436492586,
      "grad_norm": 9.084051132202148,
      "learning_rate": 4.9250986741612985e-06,
      "loss": 7.0772,
      "step": 356
    },
    {
      "epoch": 0.9284332688588007,
      "grad_norm": 4.8953070640563965,
      "learning_rate": 3.7410761759270934e-06,
      "loss": 7.2202,
      "step": 360
    },
    {
      "epoch": 0.9284332688588007,
      "eval_loss": 7.076570510864258,
      "eval_runtime": 11.0328,
      "eval_samples_per_second": 22.206,
      "eval_steps_per_second": 2.81,
      "step": 360
    },
    {
      "epoch": 0.938749194068343,
      "grad_norm": 3.9921817779541016,
      "learning_rate": 2.7178306677978767e-06,
      "loss": 7.1844,
      "step": 364
    },
    {
      "epoch": 0.9490651192778853,
      "grad_norm": 6.253467559814453,
      "learning_rate": 1.8564869654679181e-06,
      "loss": 7.1149,
      "step": 368
    },
    {
      "epoch": 0.9490651192778853,
      "eval_loss": 7.073785781860352,
      "eval_runtime": 10.9871,
      "eval_samples_per_second": 22.299,
      "eval_steps_per_second": 2.821,
      "step": 368
    },
    {
      "epoch": 0.9593810444874274,
      "grad_norm": 5.679617881774902,
      "learning_rate": 1.157991912003453e-06,
      "loss": 7.204,
      "step": 372
    },
    {
      "epoch": 0.9696969696969697,
      "grad_norm": 10.825676918029785,
      "learning_rate": 6.231133370135422e-07,
      "loss": 7.1986,
      "step": 376
    },
    {
      "epoch": 0.9696969696969697,
      "eval_loss": 7.074854850769043,
      "eval_runtime": 11.0251,
      "eval_samples_per_second": 22.222,
      "eval_steps_per_second": 2.812,
      "step": 376
    },
    {
      "epoch": 0.9800128949065119,
      "grad_norm": 3.1997151374816895,
      "learning_rate": 2.524392126034891e-07,
      "loss": 7.289,
      "step": 380
    },
    {
      "epoch": 0.9903288201160542,
      "grad_norm": 5.230665683746338,
      "learning_rate": 4.63770070389724e-08,
      "loss": 7.155,
      "step": 384
    },
    {
      "epoch": 0.9903288201160542,
      "eval_loss": 7.072939872741699,
      "eval_runtime": 11.0357,
      "eval_samples_per_second": 22.201,
      "eval_steps_per_second": 2.809,
      "step": 384
    }
  ],
  "logging_steps": 4,
  "max_steps": 387,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 8,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.288233513590456e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}