|
{ |
|
"best_metric": 1.805204153060913, |
|
"best_model_checkpoint": "/home/sr2464/scratch/C2S_Files/C2S_training_runs/multicell_v2_pretraining_runs/finetuning-EleutherAI/pythia-410m-multicell_v2_pretraining-2024-07-28_14-10-44/checkpoint-7000", |
|
"epoch": 0.024296135179068302, |
|
"eval_steps": 100, |
|
"global_step": 7000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00034708764541526143, |
|
"grad_norm": 12.638826370239258, |
|
"learning_rate": 3.470776065528252e-07, |
|
"loss": 4.1006, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.00034708764541526143, |
|
"eval_loss": 3.750605821609497, |
|
"eval_runtime": 28.5808, |
|
"eval_samples_per_second": 16.69, |
|
"eval_steps_per_second": 0.35, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0006941752908305229, |
|
"grad_norm": 7.417098522186279, |
|
"learning_rate": 6.941552131056504e-07, |
|
"loss": 3.4941, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0006941752908305229, |
|
"eval_loss": 3.152513265609741, |
|
"eval_runtime": 27.3056, |
|
"eval_samples_per_second": 17.469, |
|
"eval_steps_per_second": 0.366, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0010412629362457843, |
|
"grad_norm": 5.065331935882568, |
|
"learning_rate": 1.0412328196584756e-06, |
|
"loss": 3.0197, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0010412629362457843, |
|
"eval_loss": 2.8438754081726074, |
|
"eval_runtime": 27.3207, |
|
"eval_samples_per_second": 17.459, |
|
"eval_steps_per_second": 0.366, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0013883505816610457, |
|
"grad_norm": 5.113323211669922, |
|
"learning_rate": 1.3883104262113009e-06, |
|
"loss": 2.7915, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0013883505816610457, |
|
"eval_loss": 2.6905336380004883, |
|
"eval_runtime": 27.6367, |
|
"eval_samples_per_second": 17.26, |
|
"eval_steps_per_second": 0.362, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0017354382270763071, |
|
"grad_norm": 6.473913192749023, |
|
"learning_rate": 1.7353880327641261e-06, |
|
"loss": 2.6697, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0017354382270763071, |
|
"eval_loss": 2.609987735748291, |
|
"eval_runtime": 27.5233, |
|
"eval_samples_per_second": 17.331, |
|
"eval_steps_per_second": 0.363, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0020825258724915686, |
|
"grad_norm": 6.615407466888428, |
|
"learning_rate": 2.082465639316951e-06, |
|
"loss": 2.5959, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.0020825258724915686, |
|
"eval_loss": 2.5381109714508057, |
|
"eval_runtime": 27.411, |
|
"eval_samples_per_second": 17.402, |
|
"eval_steps_per_second": 0.365, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.00242961351790683, |
|
"grad_norm": 8.234296798706055, |
|
"learning_rate": 2.429543245869777e-06, |
|
"loss": 2.5239, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.00242961351790683, |
|
"eval_loss": 2.4786229133605957, |
|
"eval_runtime": 27.9876, |
|
"eval_samples_per_second": 17.043, |
|
"eval_steps_per_second": 0.357, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.0027767011633220914, |
|
"grad_norm": 3.987603187561035, |
|
"learning_rate": 2.7766208524226017e-06, |
|
"loss": 2.4544, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.0027767011633220914, |
|
"eval_loss": 2.4083495140075684, |
|
"eval_runtime": 27.5819, |
|
"eval_samples_per_second": 17.294, |
|
"eval_steps_per_second": 0.363, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.003123788808737353, |
|
"grad_norm": 3.854851484298706, |
|
"learning_rate": 3.123698458975427e-06, |
|
"loss": 2.3861, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.003123788808737353, |
|
"eval_loss": 2.336987257003784, |
|
"eval_runtime": 27.6168, |
|
"eval_samples_per_second": 17.272, |
|
"eval_steps_per_second": 0.362, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.0034708764541526143, |
|
"grad_norm": 3.7287888526916504, |
|
"learning_rate": 3.4707760655282523e-06, |
|
"loss": 2.3199, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0034708764541526143, |
|
"eval_loss": 2.2719526290893555, |
|
"eval_runtime": 26.9345, |
|
"eval_samples_per_second": 17.71, |
|
"eval_steps_per_second": 0.371, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0038179640995678757, |
|
"grad_norm": 3.5708882808685303, |
|
"learning_rate": 3.817853672081077e-06, |
|
"loss": 2.2467, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.0038179640995678757, |
|
"eval_loss": 2.206721067428589, |
|
"eval_runtime": 27.0387, |
|
"eval_samples_per_second": 17.641, |
|
"eval_steps_per_second": 0.37, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.004165051744983137, |
|
"grad_norm": 3.284808397293091, |
|
"learning_rate": 4.164931278633902e-06, |
|
"loss": 2.2017, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.004165051744983137, |
|
"eval_loss": 2.1687662601470947, |
|
"eval_runtime": 26.9942, |
|
"eval_samples_per_second": 17.67, |
|
"eval_steps_per_second": 0.37, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.004512139390398399, |
|
"grad_norm": 3.306734323501587, |
|
"learning_rate": 4.5120088851867285e-06, |
|
"loss": 2.1612, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.004512139390398399, |
|
"eval_loss": 2.129286766052246, |
|
"eval_runtime": 27.1243, |
|
"eval_samples_per_second": 17.586, |
|
"eval_steps_per_second": 0.369, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.00485922703581366, |
|
"grad_norm": 4.48453950881958, |
|
"learning_rate": 4.859086491739554e-06, |
|
"loss": 2.1302, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.00485922703581366, |
|
"eval_loss": 2.1066970825195312, |
|
"eval_runtime": 26.7667, |
|
"eval_samples_per_second": 17.821, |
|
"eval_steps_per_second": 0.374, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.0052063146812289214, |
|
"grad_norm": 3.936067819595337, |
|
"learning_rate": 5.206164098292378e-06, |
|
"loss": 2.1019, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.0052063146812289214, |
|
"eval_loss": 2.079268217086792, |
|
"eval_runtime": 26.9038, |
|
"eval_samples_per_second": 17.73, |
|
"eval_steps_per_second": 0.372, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.005553402326644183, |
|
"grad_norm": 3.049617290496826, |
|
"learning_rate": 5.5532417048452035e-06, |
|
"loss": 2.0839, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.005553402326644183, |
|
"eval_loss": 2.063786268234253, |
|
"eval_runtime": 26.8676, |
|
"eval_samples_per_second": 17.754, |
|
"eval_steps_per_second": 0.372, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.005900489972059444, |
|
"grad_norm": 2.732224464416504, |
|
"learning_rate": 5.900319311398029e-06, |
|
"loss": 2.0707, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.005900489972059444, |
|
"eval_loss": 2.046088695526123, |
|
"eval_runtime": 26.9915, |
|
"eval_samples_per_second": 17.672, |
|
"eval_steps_per_second": 0.37, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.006247577617474706, |
|
"grad_norm": 2.1865687370300293, |
|
"learning_rate": 6.247396917950854e-06, |
|
"loss": 2.0527, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.006247577617474706, |
|
"eval_loss": 2.031470537185669, |
|
"eval_runtime": 26.9114, |
|
"eval_samples_per_second": 17.725, |
|
"eval_steps_per_second": 0.372, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.006594665262889967, |
|
"grad_norm": 2.854724407196045, |
|
"learning_rate": 6.59447452450368e-06, |
|
"loss": 2.0407, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.006594665262889967, |
|
"eval_loss": 2.0221400260925293, |
|
"eval_runtime": 26.7594, |
|
"eval_samples_per_second": 17.825, |
|
"eval_steps_per_second": 0.374, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.006941752908305229, |
|
"grad_norm": 2.716592788696289, |
|
"learning_rate": 6.9415521310565046e-06, |
|
"loss": 2.0369, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.006941752908305229, |
|
"eval_loss": 2.0095324516296387, |
|
"eval_runtime": 26.7666, |
|
"eval_samples_per_second": 17.821, |
|
"eval_steps_per_second": 0.374, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.00728884055372049, |
|
"grad_norm": 3.099235773086548, |
|
"learning_rate": 7.28862973760933e-06, |
|
"loss": 2.0172, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.00728884055372049, |
|
"eval_loss": 1.9926340579986572, |
|
"eval_runtime": 26.7511, |
|
"eval_samples_per_second": 17.831, |
|
"eval_steps_per_second": 0.374, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.0076359281991357515, |
|
"grad_norm": 2.7901623249053955, |
|
"learning_rate": 7.635707344162154e-06, |
|
"loss": 2.0011, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.0076359281991357515, |
|
"eval_loss": 1.9787664413452148, |
|
"eval_runtime": 26.8686, |
|
"eval_samples_per_second": 17.753, |
|
"eval_steps_per_second": 0.372, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.007983015844551014, |
|
"grad_norm": 1.9205849170684814, |
|
"learning_rate": 7.982784950714981e-06, |
|
"loss": 1.9848, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.007983015844551014, |
|
"eval_loss": 1.9650758504867554, |
|
"eval_runtime": 26.6567, |
|
"eval_samples_per_second": 17.894, |
|
"eval_steps_per_second": 0.375, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.008330103489966274, |
|
"grad_norm": 2.2833688259124756, |
|
"learning_rate": 8.329862557267805e-06, |
|
"loss": 1.9804, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.008330103489966274, |
|
"eval_loss": 1.95713472366333, |
|
"eval_runtime": 26.6415, |
|
"eval_samples_per_second": 17.904, |
|
"eval_steps_per_second": 0.375, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.008677191135381537, |
|
"grad_norm": 2.6494622230529785, |
|
"learning_rate": 8.67694016382063e-06, |
|
"loss": 1.9622, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.008677191135381537, |
|
"eval_loss": 1.949569821357727, |
|
"eval_runtime": 26.7134, |
|
"eval_samples_per_second": 17.856, |
|
"eval_steps_per_second": 0.374, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.009024278780796797, |
|
"grad_norm": 1.9379554986953735, |
|
"learning_rate": 9.024017770373457e-06, |
|
"loss": 1.9624, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.009024278780796797, |
|
"eval_loss": 1.9367306232452393, |
|
"eval_runtime": 26.6328, |
|
"eval_samples_per_second": 17.91, |
|
"eval_steps_per_second": 0.375, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.00937136642621206, |
|
"grad_norm": 1.797877311706543, |
|
"learning_rate": 9.37109537692628e-06, |
|
"loss": 1.9466, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.00937136642621206, |
|
"eval_loss": 1.9342317581176758, |
|
"eval_runtime": 26.6027, |
|
"eval_samples_per_second": 17.931, |
|
"eval_steps_per_second": 0.376, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.00971845407162732, |
|
"grad_norm": 2.123497724533081, |
|
"learning_rate": 9.718172983479108e-06, |
|
"loss": 1.9421, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.00971845407162732, |
|
"eval_loss": 1.9269955158233643, |
|
"eval_runtime": 26.3282, |
|
"eval_samples_per_second": 18.117, |
|
"eval_steps_per_second": 0.38, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.010065541717042582, |
|
"grad_norm": 2.386960029602051, |
|
"learning_rate": 1.0065250590031931e-05, |
|
"loss": 1.9395, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.010065541717042582, |
|
"eval_loss": 1.9186092615127563, |
|
"eval_runtime": 26.5488, |
|
"eval_samples_per_second": 17.967, |
|
"eval_steps_per_second": 0.377, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.010412629362457843, |
|
"grad_norm": 1.994126558303833, |
|
"learning_rate": 1.0412328196584756e-05, |
|
"loss": 1.9242, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.010412629362457843, |
|
"eval_loss": 1.9123972654342651, |
|
"eval_runtime": 26.4618, |
|
"eval_samples_per_second": 18.026, |
|
"eval_steps_per_second": 0.378, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.010759717007873105, |
|
"grad_norm": 1.7175105810165405, |
|
"learning_rate": 1.0759405803137582e-05, |
|
"loss": 1.9232, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.010759717007873105, |
|
"eval_loss": 1.903687834739685, |
|
"eval_runtime": 26.5449, |
|
"eval_samples_per_second": 17.97, |
|
"eval_steps_per_second": 0.377, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.011106804653288366, |
|
"grad_norm": 1.8057008981704712, |
|
"learning_rate": 1.1106483409690407e-05, |
|
"loss": 1.923, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.011106804653288366, |
|
"eval_loss": 1.9004727602005005, |
|
"eval_runtime": 26.3249, |
|
"eval_samples_per_second": 18.12, |
|
"eval_steps_per_second": 0.38, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.011453892298703628, |
|
"grad_norm": 1.6960588693618774, |
|
"learning_rate": 1.1453561016243232e-05, |
|
"loss": 1.9102, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.011453892298703628, |
|
"eval_loss": 1.8942128419876099, |
|
"eval_runtime": 26.351, |
|
"eval_samples_per_second": 18.102, |
|
"eval_steps_per_second": 0.379, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.011800979944118889, |
|
"grad_norm": 1.8168739080429077, |
|
"learning_rate": 1.1800638622796058e-05, |
|
"loss": 1.9074, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.011800979944118889, |
|
"eval_loss": 1.889912486076355, |
|
"eval_runtime": 26.273, |
|
"eval_samples_per_second": 18.155, |
|
"eval_steps_per_second": 0.381, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.012148067589534151, |
|
"grad_norm": 1.8745700120925903, |
|
"learning_rate": 1.2147716229348883e-05, |
|
"loss": 1.9053, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.012148067589534151, |
|
"eval_loss": 1.8899447917938232, |
|
"eval_runtime": 26.3575, |
|
"eval_samples_per_second": 18.097, |
|
"eval_steps_per_second": 0.379, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.012495155234949411, |
|
"grad_norm": 1.77675199508667, |
|
"learning_rate": 1.2494793835901708e-05, |
|
"loss": 1.8983, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.012495155234949411, |
|
"eval_loss": 1.8839936256408691, |
|
"eval_runtime": 26.4506, |
|
"eval_samples_per_second": 18.034, |
|
"eval_steps_per_second": 0.378, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.012842242880364674, |
|
"grad_norm": 1.473862648010254, |
|
"learning_rate": 1.2841871442454533e-05, |
|
"loss": 1.8998, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.012842242880364674, |
|
"eval_loss": 1.8791130781173706, |
|
"eval_runtime": 26.7984, |
|
"eval_samples_per_second": 17.8, |
|
"eval_steps_per_second": 0.373, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.013189330525779934, |
|
"grad_norm": 1.486438512802124, |
|
"learning_rate": 1.318894904900736e-05, |
|
"loss": 1.8951, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.013189330525779934, |
|
"eval_loss": 1.8784898519515991, |
|
"eval_runtime": 26.3412, |
|
"eval_samples_per_second": 18.109, |
|
"eval_steps_per_second": 0.38, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.013536418171195197, |
|
"grad_norm": 1.2373360395431519, |
|
"learning_rate": 1.3536026655560182e-05, |
|
"loss": 1.887, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.013536418171195197, |
|
"eval_loss": 1.870314121246338, |
|
"eval_runtime": 26.436, |
|
"eval_samples_per_second": 18.044, |
|
"eval_steps_per_second": 0.378, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.013883505816610457, |
|
"grad_norm": 1.5347727537155151, |
|
"learning_rate": 1.3883104262113009e-05, |
|
"loss": 1.8951, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.013883505816610457, |
|
"eval_loss": 1.8662590980529785, |
|
"eval_runtime": 26.4553, |
|
"eval_samples_per_second": 18.03, |
|
"eval_steps_per_second": 0.378, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.01423059346202572, |
|
"grad_norm": 1.771752953529358, |
|
"learning_rate": 1.4230181868665834e-05, |
|
"loss": 1.8854, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.01423059346202572, |
|
"eval_loss": 1.8646233081817627, |
|
"eval_runtime": 26.398, |
|
"eval_samples_per_second": 18.07, |
|
"eval_steps_per_second": 0.379, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.01457768110744098, |
|
"grad_norm": 2.278961420059204, |
|
"learning_rate": 1.457725947521866e-05, |
|
"loss": 1.8829, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.01457768110744098, |
|
"eval_loss": 1.8655200004577637, |
|
"eval_runtime": 26.4296, |
|
"eval_samples_per_second": 18.048, |
|
"eval_steps_per_second": 0.378, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.014924768752856242, |
|
"grad_norm": 1.965826392173767, |
|
"learning_rate": 1.4924337081771487e-05, |
|
"loss": 1.8768, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.014924768752856242, |
|
"eval_loss": 1.861385464668274, |
|
"eval_runtime": 26.4002, |
|
"eval_samples_per_second": 18.068, |
|
"eval_steps_per_second": 0.379, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.015271856398271503, |
|
"grad_norm": 1.2046449184417725, |
|
"learning_rate": 1.527141468832431e-05, |
|
"loss": 1.8789, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.015271856398271503, |
|
"eval_loss": 1.8564562797546387, |
|
"eval_runtime": 26.5199, |
|
"eval_samples_per_second": 17.986, |
|
"eval_steps_per_second": 0.377, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.015618944043686765, |
|
"grad_norm": 1.2664066553115845, |
|
"learning_rate": 1.5618492294877134e-05, |
|
"loss": 1.8676, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.015618944043686765, |
|
"eval_loss": 1.8561848402023315, |
|
"eval_runtime": 26.3757, |
|
"eval_samples_per_second": 18.085, |
|
"eval_steps_per_second": 0.379, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.015966031689102027, |
|
"grad_norm": 1.242021918296814, |
|
"learning_rate": 1.5965569901429962e-05, |
|
"loss": 1.8747, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.015966031689102027, |
|
"eval_loss": 1.8486684560775757, |
|
"eval_runtime": 26.3838, |
|
"eval_samples_per_second": 18.079, |
|
"eval_steps_per_second": 0.379, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.016313119334517286, |
|
"grad_norm": 1.7827301025390625, |
|
"learning_rate": 1.6312647507982788e-05, |
|
"loss": 1.8675, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.016313119334517286, |
|
"eval_loss": 1.848850131034851, |
|
"eval_runtime": 26.3928, |
|
"eval_samples_per_second": 18.073, |
|
"eval_steps_per_second": 0.379, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.01666020697993255, |
|
"grad_norm": 1.4154130220413208, |
|
"learning_rate": 1.665972511453561e-05, |
|
"loss": 1.8646, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.01666020697993255, |
|
"eval_loss": 1.8437469005584717, |
|
"eval_runtime": 26.3717, |
|
"eval_samples_per_second": 18.088, |
|
"eval_steps_per_second": 0.379, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.01700729462534781, |
|
"grad_norm": 1.556569218635559, |
|
"learning_rate": 1.7006802721088435e-05, |
|
"loss": 1.8633, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.01700729462534781, |
|
"eval_loss": 1.8424543142318726, |
|
"eval_runtime": 26.5267, |
|
"eval_samples_per_second": 17.982, |
|
"eval_steps_per_second": 0.377, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.017354382270763073, |
|
"grad_norm": 1.2393227815628052, |
|
"learning_rate": 1.735388032764126e-05, |
|
"loss": 1.8581, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.017354382270763073, |
|
"eval_loss": 1.8405553102493286, |
|
"eval_runtime": 26.3219, |
|
"eval_samples_per_second": 18.122, |
|
"eval_steps_per_second": 0.38, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.017701469916178332, |
|
"grad_norm": 1.141632318496704, |
|
"learning_rate": 1.770095793419409e-05, |
|
"loss": 1.8589, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.017701469916178332, |
|
"eval_loss": 1.8358362913131714, |
|
"eval_runtime": 26.3268, |
|
"eval_samples_per_second": 18.118, |
|
"eval_steps_per_second": 0.38, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.018048557561593594, |
|
"grad_norm": 1.1542439460754395, |
|
"learning_rate": 1.8048035540746914e-05, |
|
"loss": 1.8583, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.018048557561593594, |
|
"eval_loss": 1.8365525007247925, |
|
"eval_runtime": 26.4883, |
|
"eval_samples_per_second": 18.008, |
|
"eval_steps_per_second": 0.378, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.018395645207008857, |
|
"grad_norm": 1.0082952976226807, |
|
"learning_rate": 1.8395113147299736e-05, |
|
"loss": 1.8523, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.018395645207008857, |
|
"eval_loss": 1.83348548412323, |
|
"eval_runtime": 26.7213, |
|
"eval_samples_per_second": 17.851, |
|
"eval_steps_per_second": 0.374, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.01874273285242412, |
|
"grad_norm": 1.3992128372192383, |
|
"learning_rate": 1.874219075385256e-05, |
|
"loss": 1.8557, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.01874273285242412, |
|
"eval_loss": 1.830024003982544, |
|
"eval_runtime": 26.3985, |
|
"eval_samples_per_second": 18.069, |
|
"eval_steps_per_second": 0.379, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.019089820497839378, |
|
"grad_norm": 1.0521454811096191, |
|
"learning_rate": 1.9089268360405386e-05, |
|
"loss": 1.8541, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.019089820497839378, |
|
"eval_loss": 1.8300797939300537, |
|
"eval_runtime": 26.4761, |
|
"eval_samples_per_second": 18.016, |
|
"eval_steps_per_second": 0.378, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.01943690814325464, |
|
"grad_norm": 1.2668583393096924, |
|
"learning_rate": 1.9436345966958215e-05, |
|
"loss": 1.8453, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.01943690814325464, |
|
"eval_loss": 1.8309643268585205, |
|
"eval_runtime": 26.3911, |
|
"eval_samples_per_second": 18.074, |
|
"eval_steps_per_second": 0.379, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.019783995788669902, |
|
"grad_norm": 1.09402334690094, |
|
"learning_rate": 1.9783423573511037e-05, |
|
"loss": 1.8536, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.019783995788669902, |
|
"eval_loss": 1.8289095163345337, |
|
"eval_runtime": 26.4192, |
|
"eval_samples_per_second": 18.055, |
|
"eval_steps_per_second": 0.379, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.020131083434085165, |
|
"grad_norm": 0.960943341255188, |
|
"learning_rate": 2.0130501180063862e-05, |
|
"loss": 1.8509, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.020131083434085165, |
|
"eval_loss": 1.824504017829895, |
|
"eval_runtime": 26.471, |
|
"eval_samples_per_second": 18.02, |
|
"eval_steps_per_second": 0.378, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.020478171079500423, |
|
"grad_norm": 1.0958678722381592, |
|
"learning_rate": 2.0477578786616688e-05, |
|
"loss": 1.8452, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.020478171079500423, |
|
"eval_loss": 1.8239096403121948, |
|
"eval_runtime": 26.3508, |
|
"eval_samples_per_second": 18.102, |
|
"eval_steps_per_second": 0.379, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.020825258724915686, |
|
"grad_norm": 1.1169177293777466, |
|
"learning_rate": 2.0824656393169513e-05, |
|
"loss": 1.8431, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.020825258724915686, |
|
"eval_loss": 1.8211857080459595, |
|
"eval_runtime": 26.3842, |
|
"eval_samples_per_second": 18.079, |
|
"eval_steps_per_second": 0.379, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.021172346370330948, |
|
"grad_norm": 1.2566906213760376, |
|
"learning_rate": 2.117173399972234e-05, |
|
"loss": 1.8399, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.021172346370330948, |
|
"eval_loss": 1.8177697658538818, |
|
"eval_runtime": 26.4481, |
|
"eval_samples_per_second": 18.035, |
|
"eval_steps_per_second": 0.378, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.02151943401574621, |
|
"grad_norm": 1.0141446590423584, |
|
"learning_rate": 2.1518811606275163e-05, |
|
"loss": 1.8412, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.02151943401574621, |
|
"eval_loss": 1.8186702728271484, |
|
"eval_runtime": 26.5203, |
|
"eval_samples_per_second": 17.986, |
|
"eval_steps_per_second": 0.377, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.021866521661161473, |
|
"grad_norm": 1.1088286638259888, |
|
"learning_rate": 2.186588921282799e-05, |
|
"loss": 1.8433, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.021866521661161473, |
|
"eval_loss": 1.8170266151428223, |
|
"eval_runtime": 26.8479, |
|
"eval_samples_per_second": 17.767, |
|
"eval_steps_per_second": 0.372, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.02221360930657673, |
|
"grad_norm": 1.0931763648986816, |
|
"learning_rate": 2.2212966819380814e-05, |
|
"loss": 1.8363, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.02221360930657673, |
|
"eval_loss": 1.81370210647583, |
|
"eval_runtime": 26.4259, |
|
"eval_samples_per_second": 18.05, |
|
"eval_steps_per_second": 0.378, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.022560696951991994, |
|
"grad_norm": 1.040462851524353, |
|
"learning_rate": 2.256004442593364e-05, |
|
"loss": 1.8339, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.022560696951991994, |
|
"eval_loss": 1.8151589632034302, |
|
"eval_runtime": 26.44, |
|
"eval_samples_per_second": 18.041, |
|
"eval_steps_per_second": 0.378, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.022907784597407256, |
|
"grad_norm": 1.0170607566833496, |
|
"learning_rate": 2.2907122032486464e-05, |
|
"loss": 1.833, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.022907784597407256, |
|
"eval_loss": 1.8107798099517822, |
|
"eval_runtime": 26.3941, |
|
"eval_samples_per_second": 18.072, |
|
"eval_steps_per_second": 0.379, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.02325487224282252, |
|
"grad_norm": 0.813853919506073, |
|
"learning_rate": 2.325419963903929e-05, |
|
"loss": 1.8338, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.02325487224282252, |
|
"eval_loss": 1.8114594221115112, |
|
"eval_runtime": 26.3741, |
|
"eval_samples_per_second": 18.086, |
|
"eval_steps_per_second": 0.379, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.023601959888237777, |
|
"grad_norm": 1.4288556575775146, |
|
"learning_rate": 2.3601277245592115e-05, |
|
"loss": 1.8301, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.023601959888237777, |
|
"eval_loss": 1.8073900938034058, |
|
"eval_runtime": 26.3565, |
|
"eval_samples_per_second": 18.098, |
|
"eval_steps_per_second": 0.379, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.02394904753365304, |
|
"grad_norm": 0.8318554162979126, |
|
"learning_rate": 2.394835485214494e-05, |
|
"loss": 1.8293, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.02394904753365304, |
|
"eval_loss": 1.80862295627594, |
|
"eval_runtime": 26.4221, |
|
"eval_samples_per_second": 18.053, |
|
"eval_steps_per_second": 0.378, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.024296135179068302, |
|
"grad_norm": 0.9045419096946716, |
|
"learning_rate": 2.4295432458697766e-05, |
|
"loss": 1.8324, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.024296135179068302, |
|
"eval_loss": 1.805204153060913, |
|
"eval_runtime": 26.4185, |
|
"eval_samples_per_second": 18.056, |
|
"eval_steps_per_second": 0.379, |
|
"step": 7000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 288111, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.46085192400896e+18, |
|
"train_batch_size": 48, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|