|
{ |
|
"best_metric": 0.6768932938575745, |
|
"best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-7b/checkpoint-250", |
|
"epoch": 10.0, |
|
"eval_steps": 1.0, |
|
"global_step": 320, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 1.0817695604199613, |
|
"learning_rate": 0.0, |
|
"loss": 1.3872, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"eval_loss": 1.4023343324661255, |
|
"eval_runtime": 35.2562, |
|
"eval_samples_per_second": 5.673, |
|
"eval_steps_per_second": 0.369, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 0.8573794343563677, |
|
"learning_rate": 8.613531161467863e-06, |
|
"loss": 1.3352, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"eval_loss": 1.4023343324661255, |
|
"eval_runtime": 27.8829, |
|
"eval_samples_per_second": 7.173, |
|
"eval_steps_per_second": 0.466, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 0.8545279010393898, |
|
"learning_rate": 1.3652123889719709e-05, |
|
"loss": 1.3838, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"eval_loss": 1.3825562000274658, |
|
"eval_runtime": 27.9018, |
|
"eval_samples_per_second": 7.168, |
|
"eval_steps_per_second": 0.466, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.7747695318679186, |
|
"learning_rate": 1.7227062322935725e-05, |
|
"loss": 1.3442, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"eval_loss": 1.3529690504074097, |
|
"eval_runtime": 27.9234, |
|
"eval_samples_per_second": 7.162, |
|
"eval_steps_per_second": 0.466, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 0.9223438945487747, |
|
"learning_rate": 2e-05, |
|
"loss": 1.3265, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"eval_loss": 1.3111159801483154, |
|
"eval_runtime": 27.8183, |
|
"eval_samples_per_second": 7.19, |
|
"eval_steps_per_second": 0.467, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 0.8553066709777654, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2969, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"eval_loss": 1.267953634262085, |
|
"eval_runtime": 28.5087, |
|
"eval_samples_per_second": 7.015, |
|
"eval_steps_per_second": 0.456, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 0.7513319744508511, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2643, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"eval_loss": 1.2324440479278564, |
|
"eval_runtime": 28.7026, |
|
"eval_samples_per_second": 6.968, |
|
"eval_steps_per_second": 0.453, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5926161530676572, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2343, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.2082672119140625, |
|
"eval_runtime": 28.709, |
|
"eval_samples_per_second": 6.966, |
|
"eval_steps_per_second": 0.453, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 0.45585108261607465, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2556, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"eval_loss": 1.1897780895233154, |
|
"eval_runtime": 28.5026, |
|
"eval_samples_per_second": 7.017, |
|
"eval_steps_per_second": 0.456, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.45306175711380503, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1941, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"eval_loss": 1.1719207763671875, |
|
"eval_runtime": 28.4252, |
|
"eval_samples_per_second": 7.036, |
|
"eval_steps_per_second": 0.457, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 0.40702053502599356, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2414, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"eval_loss": 1.1534627676010132, |
|
"eval_runtime": 31.953, |
|
"eval_samples_per_second": 6.259, |
|
"eval_steps_per_second": 0.407, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.45771435281195333, |
|
"learning_rate": 2e-05, |
|
"loss": 1.202, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"eval_loss": 1.1343497037887573, |
|
"eval_runtime": 31.7064, |
|
"eval_samples_per_second": 6.308, |
|
"eval_steps_per_second": 0.41, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 0.49237132802399297, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2167, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"eval_loss": 1.1149284839630127, |
|
"eval_runtime": 31.7514, |
|
"eval_samples_per_second": 6.299, |
|
"eval_steps_per_second": 0.409, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 0.4707558788321445, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0463, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"eval_loss": 1.0956928730010986, |
|
"eval_runtime": 30.7821, |
|
"eval_samples_per_second": 6.497, |
|
"eval_steps_per_second": 0.422, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 0.44161060970171445, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1615, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"eval_loss": 1.0776234865188599, |
|
"eval_runtime": 30.5336, |
|
"eval_samples_per_second": 6.55, |
|
"eval_steps_per_second": 0.426, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.43310242386256154, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0941, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.061128854751587, |
|
"eval_runtime": 33.8247, |
|
"eval_samples_per_second": 5.913, |
|
"eval_steps_per_second": 0.384, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 0.3719623439057395, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0992, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"eval_loss": 1.0465847253799438, |
|
"eval_runtime": 32.7443, |
|
"eval_samples_per_second": 6.108, |
|
"eval_steps_per_second": 0.397, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.42266460981580545, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0904, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"eval_loss": 1.0327677726745605, |
|
"eval_runtime": 32.5697, |
|
"eval_samples_per_second": 6.141, |
|
"eval_steps_per_second": 0.399, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 0.35416098431161336, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0055, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"eval_loss": 1.019870638847351, |
|
"eval_runtime": 32.6927, |
|
"eval_samples_per_second": 6.118, |
|
"eval_steps_per_second": 0.398, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.3454390449296124, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1291, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"eval_loss": 1.008323311805725, |
|
"eval_runtime": 32.5051, |
|
"eval_samples_per_second": 6.153, |
|
"eval_steps_per_second": 0.4, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 0.291766075949861, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0363, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"eval_loss": 0.9983346462249756, |
|
"eval_runtime": 36.1543, |
|
"eval_samples_per_second": 5.532, |
|
"eval_steps_per_second": 0.36, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 0.3071914269593122, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0869, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"eval_loss": 0.989651083946228, |
|
"eval_runtime": 35.9583, |
|
"eval_samples_per_second": 5.562, |
|
"eval_steps_per_second": 0.362, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 0.2642686659789585, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0706, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"eval_loss": 0.981977641582489, |
|
"eval_runtime": 35.7624, |
|
"eval_samples_per_second": 5.592, |
|
"eval_steps_per_second": 0.364, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.23789134722319716, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0669, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.9751532077789307, |
|
"eval_runtime": 35.6905, |
|
"eval_samples_per_second": 5.604, |
|
"eval_steps_per_second": 0.364, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 0.26302325685095884, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0141, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"eval_loss": 0.9684178233146667, |
|
"eval_runtime": 35.4693, |
|
"eval_samples_per_second": 5.639, |
|
"eval_steps_per_second": 0.367, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.2406662725995088, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0381, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"eval_loss": 0.9618947505950928, |
|
"eval_runtime": 37.5325, |
|
"eval_samples_per_second": 5.329, |
|
"eval_steps_per_second": 0.346, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 0.27899113172875245, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9693, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"eval_loss": 0.9552007913589478, |
|
"eval_runtime": 37.4006, |
|
"eval_samples_per_second": 5.348, |
|
"eval_steps_per_second": 0.348, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.29303174930955905, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9841, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"eval_loss": 0.9481881856918335, |
|
"eval_runtime": 37.7821, |
|
"eval_samples_per_second": 5.294, |
|
"eval_steps_per_second": 0.344, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 0.22138226087715307, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9959, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"eval_loss": 0.9415397644042969, |
|
"eval_runtime": 37.9058, |
|
"eval_samples_per_second": 5.276, |
|
"eval_steps_per_second": 0.343, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.23456101188675513, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0351, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"eval_loss": 0.9354143738746643, |
|
"eval_runtime": 37.9727, |
|
"eval_samples_per_second": 5.267, |
|
"eval_steps_per_second": 0.342, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 0.2594838155429295, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8741, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"eval_loss": 0.9291737079620361, |
|
"eval_runtime": 37.081, |
|
"eval_samples_per_second": 5.394, |
|
"eval_steps_per_second": 0.351, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.2404582058613114, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9814, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.9231625199317932, |
|
"eval_runtime": 37.0946, |
|
"eval_samples_per_second": 5.392, |
|
"eval_steps_per_second": 0.35, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"grad_norm": 0.26862391186560797, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0241, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"eval_loss": 0.917277991771698, |
|
"eval_runtime": 37.1872, |
|
"eval_samples_per_second": 5.378, |
|
"eval_steps_per_second": 0.35, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 0.24997341491489666, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0296, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"eval_loss": 0.9116549491882324, |
|
"eval_runtime": 30.7053, |
|
"eval_samples_per_second": 6.514, |
|
"eval_steps_per_second": 0.423, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 0.22755062908849677, |
|
"learning_rate": 2e-05, |
|
"loss": 1.047, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"eval_loss": 0.9061525464057922, |
|
"eval_runtime": 30.5238, |
|
"eval_samples_per_second": 6.552, |
|
"eval_steps_per_second": 0.426, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.2478793998097894, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0071, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"eval_loss": 0.9007319808006287, |
|
"eval_runtime": 30.4573, |
|
"eval_samples_per_second": 6.567, |
|
"eval_steps_per_second": 0.427, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"grad_norm": 0.2319702521014333, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9517, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"eval_loss": 0.8955077528953552, |
|
"eval_runtime": 30.6396, |
|
"eval_samples_per_second": 6.528, |
|
"eval_steps_per_second": 0.424, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 0.26929965642782505, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9638, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"eval_loss": 0.8906582593917847, |
|
"eval_runtime": 30.5706, |
|
"eval_samples_per_second": 6.542, |
|
"eval_steps_per_second": 0.425, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"grad_norm": 0.25494286133089294, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9922, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"eval_loss": 0.8858879804611206, |
|
"eval_runtime": 30.2267, |
|
"eval_samples_per_second": 6.617, |
|
"eval_steps_per_second": 0.43, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.2468866713698415, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9873, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8811590671539307, |
|
"eval_runtime": 30.1065, |
|
"eval_samples_per_second": 6.643, |
|
"eval_steps_per_second": 0.432, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"grad_norm": 0.2460619663724958, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9608, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"eval_loss": 0.876426637172699, |
|
"eval_runtime": 30.2618, |
|
"eval_samples_per_second": 6.609, |
|
"eval_steps_per_second": 0.43, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 0.244111044045335, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9496, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"eval_loss": 0.8720347881317139, |
|
"eval_runtime": 30.2637, |
|
"eval_samples_per_second": 6.609, |
|
"eval_steps_per_second": 0.43, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"grad_norm": 0.24263485999072093, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9076, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"eval_loss": 0.8677232265472412, |
|
"eval_runtime": 30.0588, |
|
"eval_samples_per_second": 6.654, |
|
"eval_steps_per_second": 0.432, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 0.2549786588443146, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9291, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"eval_loss": 0.864047110080719, |
|
"eval_runtime": 30.3833, |
|
"eval_samples_per_second": 6.583, |
|
"eval_steps_per_second": 0.428, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 0.27020952324959413, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9111, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"eval_loss": 0.8608524799346924, |
|
"eval_runtime": 30.284, |
|
"eval_samples_per_second": 6.604, |
|
"eval_steps_per_second": 0.429, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 0.24108750741309573, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8363, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"eval_loss": 0.8525222539901733, |
|
"eval_runtime": 51.3231, |
|
"eval_samples_per_second": 3.897, |
|
"eval_steps_per_second": 0.487, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"grad_norm": 0.23963570627035977, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9776, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"eval_loss": 0.8498736619949341, |
|
"eval_runtime": 43.9039, |
|
"eval_samples_per_second": 4.555, |
|
"eval_steps_per_second": 0.569, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.2738559790360609, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9075, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.846975564956665, |
|
"eval_runtime": 43.6943, |
|
"eval_samples_per_second": 4.577, |
|
"eval_steps_per_second": 0.572, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"grad_norm": 0.2516715524185528, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9256, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"eval_loss": 0.8441421985626221, |
|
"eval_runtime": 44.0977, |
|
"eval_samples_per_second": 4.535, |
|
"eval_steps_per_second": 0.567, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.25797542568004944, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9168, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"eval_loss": 0.8408769369125366, |
|
"eval_runtime": 45.4442, |
|
"eval_samples_per_second": 4.401, |
|
"eval_steps_per_second": 0.55, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"grad_norm": 0.24530872900913284, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8547, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"eval_loss": 0.8373726010322571, |
|
"eval_runtime": 44.6363, |
|
"eval_samples_per_second": 4.481, |
|
"eval_steps_per_second": 0.56, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.2549609506617865, |
|
"learning_rate": 2e-05, |
|
"loss": 0.979, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"eval_loss": 0.8340890407562256, |
|
"eval_runtime": 45.991, |
|
"eval_samples_per_second": 4.349, |
|
"eval_steps_per_second": 0.544, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"grad_norm": 0.24114496664848603, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9196, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"eval_loss": 0.8311529755592346, |
|
"eval_runtime": 46.0654, |
|
"eval_samples_per_second": 4.342, |
|
"eval_steps_per_second": 0.543, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 0.29287872202759435, |
|
"learning_rate": 2e-05, |
|
"loss": 0.967, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"eval_loss": 0.8281388282775879, |
|
"eval_runtime": 46.0396, |
|
"eval_samples_per_second": 4.344, |
|
"eval_steps_per_second": 0.543, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 0.2620663114325604, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9576, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"eval_loss": 0.8252360820770264, |
|
"eval_runtime": 44.8935, |
|
"eval_samples_per_second": 4.455, |
|
"eval_steps_per_second": 0.557, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.24813796796229484, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9652, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 0.8228487968444824, |
|
"eval_runtime": 45.9424, |
|
"eval_samples_per_second": 4.353, |
|
"eval_steps_per_second": 0.544, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"grad_norm": 0.25644243214043555, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8938, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"eval_loss": 0.8202834129333496, |
|
"eval_runtime": 45.4583, |
|
"eval_samples_per_second": 4.4, |
|
"eval_steps_per_second": 0.55, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 0.24429328723074778, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9373, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"eval_loss": 0.8179032802581787, |
|
"eval_runtime": 45.7499, |
|
"eval_samples_per_second": 4.372, |
|
"eval_steps_per_second": 0.546, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"grad_norm": 0.26226013327841075, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8474, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"eval_loss": 0.8154602646827698, |
|
"eval_runtime": 46.1391, |
|
"eval_samples_per_second": 4.335, |
|
"eval_steps_per_second": 0.542, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.2581666046262149, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8517, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"eval_loss": 0.812771737575531, |
|
"eval_runtime": 45.5621, |
|
"eval_samples_per_second": 4.39, |
|
"eval_steps_per_second": 0.549, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"grad_norm": 0.2593197258112398, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9011, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"eval_loss": 0.810187816619873, |
|
"eval_runtime": 46.0597, |
|
"eval_samples_per_second": 4.342, |
|
"eval_steps_per_second": 0.543, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 0.2899895571193183, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9277, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"eval_loss": 0.8083757758140564, |
|
"eval_runtime": 45.8079, |
|
"eval_samples_per_second": 4.366, |
|
"eval_steps_per_second": 0.546, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"grad_norm": 0.2759215195414453, |
|
"learning_rate": 2e-05, |
|
"loss": 0.772, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"eval_loss": 0.8061204552650452, |
|
"eval_runtime": 47.3286, |
|
"eval_samples_per_second": 4.226, |
|
"eval_steps_per_second": 0.528, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.27248680511516205, |
|
"learning_rate": 2e-05, |
|
"loss": 0.874, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.8037504553794861, |
|
"eval_runtime": 46.1177, |
|
"eval_samples_per_second": 4.337, |
|
"eval_steps_per_second": 0.542, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 0.3116755816558186, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8647, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"eval_loss": 0.8007115125656128, |
|
"eval_runtime": 46.1583, |
|
"eval_samples_per_second": 4.333, |
|
"eval_steps_per_second": 0.542, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 0.273032515206887, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8862, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"eval_loss": 0.7983976006507874, |
|
"eval_runtime": 47.3469, |
|
"eval_samples_per_second": 4.224, |
|
"eval_steps_per_second": 0.528, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"grad_norm": 0.2925240383907651, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8617, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"eval_loss": 0.7959001064300537, |
|
"eval_runtime": 47.9208, |
|
"eval_samples_per_second": 4.174, |
|
"eval_steps_per_second": 0.522, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 0.25775933439981163, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9269, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"eval_loss": 0.7938115000724792, |
|
"eval_runtime": 47.8909, |
|
"eval_samples_per_second": 4.176, |
|
"eval_steps_per_second": 0.522, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"grad_norm": 0.2669684013704678, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8607, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"eval_loss": 0.7918573617935181, |
|
"eval_runtime": 47.39, |
|
"eval_samples_per_second": 4.22, |
|
"eval_steps_per_second": 0.528, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 0.312578346444957, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8086, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"eval_loss": 0.7894810438156128, |
|
"eval_runtime": 46.2927, |
|
"eval_samples_per_second": 4.32, |
|
"eval_steps_per_second": 0.54, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"grad_norm": 0.25622754870894693, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8945, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"eval_loss": 0.7875316739082336, |
|
"eval_runtime": 45.7617, |
|
"eval_samples_per_second": 4.37, |
|
"eval_steps_per_second": 0.546, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.27025767580736354, |
|
"learning_rate": 2e-05, |
|
"loss": 0.815, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_loss": 0.7858334183692932, |
|
"eval_runtime": 46.2427, |
|
"eval_samples_per_second": 4.325, |
|
"eval_steps_per_second": 0.541, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"grad_norm": 0.3110479115695806, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8621, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"eval_loss": 0.7841551303863525, |
|
"eval_runtime": 46.5372, |
|
"eval_samples_per_second": 4.298, |
|
"eval_steps_per_second": 0.537, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 0.26061305588172545, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8622, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"eval_loss": 0.7826495170593262, |
|
"eval_runtime": 46.1361, |
|
"eval_samples_per_second": 4.335, |
|
"eval_steps_per_second": 0.542, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 0.27448719719872205, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9118, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"eval_loss": 0.7811364531517029, |
|
"eval_runtime": 47.6194, |
|
"eval_samples_per_second": 4.2, |
|
"eval_steps_per_second": 0.525, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 0.27078145092639194, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8256, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"eval_loss": 0.779961109161377, |
|
"eval_runtime": 46.0097, |
|
"eval_samples_per_second": 4.347, |
|
"eval_steps_per_second": 0.543, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"grad_norm": 0.2634646272324293, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8774, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"eval_loss": 0.7788712978363037, |
|
"eval_runtime": 46.2712, |
|
"eval_samples_per_second": 4.322, |
|
"eval_steps_per_second": 0.54, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 0.3101668401682978, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8769, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"eval_loss": 0.7776928544044495, |
|
"eval_runtime": 46.3791, |
|
"eval_samples_per_second": 4.312, |
|
"eval_steps_per_second": 0.539, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"grad_norm": 0.28798302574187284, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8765, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"eval_loss": 0.7773044109344482, |
|
"eval_runtime": 43.9352, |
|
"eval_samples_per_second": 4.552, |
|
"eval_steps_per_second": 0.569, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.3349887736240022, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9202, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.7766420245170593, |
|
"eval_runtime": 44.0118, |
|
"eval_samples_per_second": 4.544, |
|
"eval_steps_per_second": 0.568, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"grad_norm": 0.3272989979927921, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8496, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"eval_loss": 0.7754170894622803, |
|
"eval_runtime": 44.5079, |
|
"eval_samples_per_second": 4.494, |
|
"eval_steps_per_second": 0.562, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 0.2937867633662159, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9088, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"eval_loss": 0.7740327715873718, |
|
"eval_runtime": 43.7759, |
|
"eval_samples_per_second": 4.569, |
|
"eval_steps_per_second": 0.571, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"grad_norm": 0.3001827875228488, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8514, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"eval_loss": 0.7725099921226501, |
|
"eval_runtime": 43.9246, |
|
"eval_samples_per_second": 4.553, |
|
"eval_steps_per_second": 0.569, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 0.3153202233063334, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8232, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"eval_loss": 0.7707765698432922, |
|
"eval_runtime": 45.7981, |
|
"eval_samples_per_second": 4.367, |
|
"eval_steps_per_second": 0.546, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 0.3084122812305825, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7899, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"eval_loss": 0.7689283490180969, |
|
"eval_runtime": 43.8712, |
|
"eval_samples_per_second": 4.559, |
|
"eval_steps_per_second": 0.57, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 0.34994590801092706, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8186, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"eval_loss": 0.7668275237083435, |
|
"eval_runtime": 44.0477, |
|
"eval_samples_per_second": 4.541, |
|
"eval_steps_per_second": 0.568, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"grad_norm": 0.33626535961990944, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8439, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"eval_loss": 0.7653672695159912, |
|
"eval_runtime": 43.9923, |
|
"eval_samples_per_second": 4.546, |
|
"eval_steps_per_second": 0.568, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.33991458856080364, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9309, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_loss": 0.7641142010688782, |
|
"eval_runtime": 44.018, |
|
"eval_samples_per_second": 4.544, |
|
"eval_steps_per_second": 0.568, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"grad_norm": 0.3212547051979476, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8262, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"eval_loss": 0.763224720954895, |
|
"eval_runtime": 43.7722, |
|
"eval_samples_per_second": 4.569, |
|
"eval_steps_per_second": 0.571, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 0.335120027091876, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8795, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"eval_loss": 0.7624655365943909, |
|
"eval_runtime": 44.1972, |
|
"eval_samples_per_second": 4.525, |
|
"eval_steps_per_second": 0.566, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"grad_norm": 0.33822766071160937, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7798, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"eval_loss": 0.761708676815033, |
|
"eval_runtime": 43.8244, |
|
"eval_samples_per_second": 4.564, |
|
"eval_steps_per_second": 0.57, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 0.33505853726890483, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8715, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"eval_loss": 0.7611495852470398, |
|
"eval_runtime": 43.7833, |
|
"eval_samples_per_second": 4.568, |
|
"eval_steps_per_second": 0.571, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"grad_norm": 0.3126942865091584, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8102, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"eval_loss": 0.7608107924461365, |
|
"eval_runtime": 44.0119, |
|
"eval_samples_per_second": 4.544, |
|
"eval_steps_per_second": 0.568, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 0.3594152593867412, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8871, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"eval_loss": 0.7598913311958313, |
|
"eval_runtime": 43.8956, |
|
"eval_samples_per_second": 4.556, |
|
"eval_steps_per_second": 0.57, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 0.3161380007473764, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8278, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"eval_loss": 0.7596660852432251, |
|
"eval_runtime": 44.0687, |
|
"eval_samples_per_second": 4.538, |
|
"eval_steps_per_second": 0.567, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.3922097294803287, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7988, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.7576884627342224, |
|
"eval_runtime": 44.1881, |
|
"eval_samples_per_second": 4.526, |
|
"eval_steps_per_second": 0.566, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"grad_norm": 0.372234038126675, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7558, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"eval_loss": 0.7546435594558716, |
|
"eval_runtime": 43.8881, |
|
"eval_samples_per_second": 4.557, |
|
"eval_steps_per_second": 0.57, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"grad_norm": 0.3249396043376576, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8422, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"eval_loss": 0.7515354752540588, |
|
"eval_runtime": 44.5887, |
|
"eval_samples_per_second": 4.485, |
|
"eval_steps_per_second": 0.561, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"grad_norm": 0.3194387311297811, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8059, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"eval_loss": 0.7486842274665833, |
|
"eval_runtime": 44.0967, |
|
"eval_samples_per_second": 4.535, |
|
"eval_steps_per_second": 0.567, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 0.3434194037136213, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8341, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"eval_loss": 0.7464652061462402, |
|
"eval_runtime": 44.0666, |
|
"eval_samples_per_second": 4.539, |
|
"eval_steps_per_second": 0.567, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"grad_norm": 0.33666008484696835, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7731, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"eval_loss": 0.7450191378593445, |
|
"eval_runtime": 44.0337, |
|
"eval_samples_per_second": 4.542, |
|
"eval_steps_per_second": 0.568, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"grad_norm": 0.3596265575837954, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8354, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"eval_loss": 0.7442840337753296, |
|
"eval_runtime": 44.0804, |
|
"eval_samples_per_second": 4.537, |
|
"eval_steps_per_second": 0.567, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"grad_norm": 0.37228869739935877, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8476, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"eval_loss": 0.74405837059021, |
|
"eval_runtime": 43.9201, |
|
"eval_samples_per_second": 4.554, |
|
"eval_steps_per_second": 0.569, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.372126737706513, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7568, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"eval_loss": 0.7435027360916138, |
|
"eval_runtime": 44.0105, |
|
"eval_samples_per_second": 4.544, |
|
"eval_steps_per_second": 0.568, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"grad_norm": 0.3362686942090606, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8035, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"eval_loss": 0.7431904673576355, |
|
"eval_runtime": 43.9113, |
|
"eval_samples_per_second": 4.555, |
|
"eval_steps_per_second": 0.569, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"grad_norm": 0.36392229188159225, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8353, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"eval_loss": 0.7430496215820312, |
|
"eval_runtime": 44.6371, |
|
"eval_samples_per_second": 4.481, |
|
"eval_steps_per_second": 0.56, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"grad_norm": 0.4471327905090859, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7363, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"eval_loss": 0.7411425709724426, |
|
"eval_runtime": 44.7094, |
|
"eval_samples_per_second": 4.473, |
|
"eval_steps_per_second": 0.559, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 0.3716356236311949, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7774, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"eval_loss": 0.7391970753669739, |
|
"eval_runtime": 44.6877, |
|
"eval_samples_per_second": 4.476, |
|
"eval_steps_per_second": 0.559, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"grad_norm": 0.39848151618324823, |
|
"learning_rate": 2e-05, |
|
"loss": 0.766, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"eval_loss": 0.7370663285255432, |
|
"eval_runtime": 44.7716, |
|
"eval_samples_per_second": 4.467, |
|
"eval_steps_per_second": 0.558, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 0.3979613694284285, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7647, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"eval_loss": 0.7347142100334167, |
|
"eval_runtime": 46.1551, |
|
"eval_samples_per_second": 4.333, |
|
"eval_steps_per_second": 0.542, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"grad_norm": 0.4005021474949748, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8363, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"eval_loss": 0.7330761551856995, |
|
"eval_runtime": 45.4921, |
|
"eval_samples_per_second": 4.396, |
|
"eval_steps_per_second": 0.55, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.3814831442952738, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8172, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_loss": 0.7321842908859253, |
|
"eval_runtime": 46.3117, |
|
"eval_samples_per_second": 4.319, |
|
"eval_steps_per_second": 0.54, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"grad_norm": 0.37084330088188894, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8984, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"eval_loss": 0.7323736548423767, |
|
"eval_runtime": 45.7394, |
|
"eval_samples_per_second": 4.373, |
|
"eval_steps_per_second": 0.547, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"grad_norm": 0.4074607742772961, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7623, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"eval_loss": 0.7331156134605408, |
|
"eval_runtime": 47.2117, |
|
"eval_samples_per_second": 4.236, |
|
"eval_steps_per_second": 0.53, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"grad_norm": 0.3478981526620727, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8294, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"eval_loss": 0.7339057326316833, |
|
"eval_runtime": 45.3783, |
|
"eval_samples_per_second": 4.407, |
|
"eval_steps_per_second": 0.551, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 0.4015868947675386, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"eval_loss": 0.7341201305389404, |
|
"eval_runtime": 45.9888, |
|
"eval_samples_per_second": 4.349, |
|
"eval_steps_per_second": 0.544, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"grad_norm": 0.3908261734781783, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7903, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"eval_loss": 0.7336520552635193, |
|
"eval_runtime": 45.9012, |
|
"eval_samples_per_second": 4.357, |
|
"eval_steps_per_second": 0.545, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"grad_norm": 0.39497646856232355, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8072, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"eval_loss": 0.7335306406021118, |
|
"eval_runtime": 46.2389, |
|
"eval_samples_per_second": 4.325, |
|
"eval_steps_per_second": 0.541, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"grad_norm": 0.3773137872461335, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8647, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"eval_loss": 0.7331534028053284, |
|
"eval_runtime": 46.662, |
|
"eval_samples_per_second": 4.286, |
|
"eval_steps_per_second": 0.536, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.353841599712999, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8076, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 0.732619047164917, |
|
"eval_runtime": 47.5847, |
|
"eval_samples_per_second": 4.203, |
|
"eval_steps_per_second": 0.525, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"grad_norm": 0.38703604888096965, |
|
"learning_rate": 2e-05, |
|
"loss": 0.783, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"eval_loss": 0.7308679223060608, |
|
"eval_runtime": 47.3672, |
|
"eval_samples_per_second": 4.222, |
|
"eval_steps_per_second": 0.528, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"grad_norm": 0.406784109988961, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8592, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"eval_loss": 0.7294270396232605, |
|
"eval_runtime": 46.3156, |
|
"eval_samples_per_second": 4.318, |
|
"eval_steps_per_second": 0.54, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"grad_norm": 0.3867362432665531, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7773, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"eval_loss": 0.7278974056243896, |
|
"eval_runtime": 46.0714, |
|
"eval_samples_per_second": 4.341, |
|
"eval_steps_per_second": 0.543, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 0.37454905814944983, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8054, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"eval_loss": 0.7264491319656372, |
|
"eval_runtime": 46.0579, |
|
"eval_samples_per_second": 4.342, |
|
"eval_steps_per_second": 0.543, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"grad_norm": 0.444384159363942, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8434, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"eval_loss": 0.7248883843421936, |
|
"eval_runtime": 46.2593, |
|
"eval_samples_per_second": 4.323, |
|
"eval_steps_per_second": 0.54, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"grad_norm": 0.4296603454332508, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8154, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"eval_loss": 0.7236350774765015, |
|
"eval_runtime": 47.8167, |
|
"eval_samples_per_second": 4.183, |
|
"eval_steps_per_second": 0.523, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"grad_norm": 0.4369101294390371, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7759, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"eval_loss": 0.7224241495132446, |
|
"eval_runtime": 45.8583, |
|
"eval_samples_per_second": 4.361, |
|
"eval_steps_per_second": 0.545, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.4294598409798285, |
|
"learning_rate": 2e-05, |
|
"loss": 0.706, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.7210729718208313, |
|
"eval_runtime": 45.9047, |
|
"eval_samples_per_second": 4.357, |
|
"eval_steps_per_second": 0.545, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"grad_norm": 0.355178274167416, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7969, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"eval_loss": 0.7206510901451111, |
|
"eval_runtime": 46.1016, |
|
"eval_samples_per_second": 4.338, |
|
"eval_steps_per_second": 0.542, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"grad_norm": 0.39855476598487416, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8124, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"eval_loss": 0.7203733921051025, |
|
"eval_runtime": 46.5052, |
|
"eval_samples_per_second": 4.301, |
|
"eval_steps_per_second": 0.538, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"grad_norm": 0.38252767359910733, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8126, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"eval_loss": 0.7201277017593384, |
|
"eval_runtime": 47.5144, |
|
"eval_samples_per_second": 4.209, |
|
"eval_steps_per_second": 0.526, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"grad_norm": 0.44006887742113143, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7706, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"eval_loss": 0.7195135354995728, |
|
"eval_runtime": 45.8417, |
|
"eval_samples_per_second": 4.363, |
|
"eval_steps_per_second": 0.545, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"grad_norm": 0.426129225179819, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8699, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"eval_loss": 0.7189508080482483, |
|
"eval_runtime": 46.2247, |
|
"eval_samples_per_second": 4.327, |
|
"eval_steps_per_second": 0.541, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"grad_norm": 0.4995092725647276, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7811, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"eval_loss": 0.7180965542793274, |
|
"eval_runtime": 46.4605, |
|
"eval_samples_per_second": 4.305, |
|
"eval_steps_per_second": 0.538, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"grad_norm": 0.42664484060733815, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7795, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"eval_loss": 0.7173775434494019, |
|
"eval_runtime": 46.1896, |
|
"eval_samples_per_second": 4.33, |
|
"eval_steps_per_second": 0.541, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.43970733071879864, |
|
"learning_rate": 2e-05, |
|
"loss": 0.772, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"eval_loss": 0.716987133026123, |
|
"eval_runtime": 45.88, |
|
"eval_samples_per_second": 4.359, |
|
"eval_steps_per_second": 0.545, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"grad_norm": 0.4585774179958974, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7594, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"eval_loss": 0.7162837386131287, |
|
"eval_runtime": 45.9687, |
|
"eval_samples_per_second": 4.351, |
|
"eval_steps_per_second": 0.544, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"grad_norm": 0.4482018280143517, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7702, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"eval_loss": 0.7155399918556213, |
|
"eval_runtime": 46.1566, |
|
"eval_samples_per_second": 4.333, |
|
"eval_steps_per_second": 0.542, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"grad_norm": 0.44262087649988896, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7323, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"eval_loss": 0.7145451307296753, |
|
"eval_runtime": 46.2257, |
|
"eval_samples_per_second": 4.327, |
|
"eval_steps_per_second": 0.541, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"grad_norm": 0.4418100350036369, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7669, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"eval_loss": 0.7139186263084412, |
|
"eval_runtime": 46.1994, |
|
"eval_samples_per_second": 4.329, |
|
"eval_steps_per_second": 0.541, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"grad_norm": 0.4068223149751762, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7806, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"eval_loss": 0.7134376764297485, |
|
"eval_runtime": 48.1068, |
|
"eval_samples_per_second": 4.157, |
|
"eval_steps_per_second": 0.52, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"grad_norm": 0.4339025102618351, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7312, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"eval_loss": 0.7134268879890442, |
|
"eval_runtime": 46.8951, |
|
"eval_samples_per_second": 4.265, |
|
"eval_steps_per_second": 0.533, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"grad_norm": 0.45474838622605346, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7358, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"eval_loss": 0.7131960391998291, |
|
"eval_runtime": 46.8155, |
|
"eval_samples_per_second": 4.272, |
|
"eval_steps_per_second": 0.534, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.4284980958119551, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7146, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"eval_loss": 0.7122372388839722, |
|
"eval_runtime": 46.7899, |
|
"eval_samples_per_second": 4.274, |
|
"eval_steps_per_second": 0.534, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"grad_norm": 0.4679473362578349, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8018, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"eval_loss": 0.7106640338897705, |
|
"eval_runtime": 46.845, |
|
"eval_samples_per_second": 4.269, |
|
"eval_steps_per_second": 0.534, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"grad_norm": 0.4900067169351881, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6884, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"eval_loss": 0.7087500095367432, |
|
"eval_runtime": 47.5958, |
|
"eval_samples_per_second": 4.202, |
|
"eval_steps_per_second": 0.525, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.59375, |
|
"grad_norm": 0.4734076525152252, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7491, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.59375, |
|
"eval_loss": 0.7072947025299072, |
|
"eval_runtime": 48.7251, |
|
"eval_samples_per_second": 4.105, |
|
"eval_steps_per_second": 0.513, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"grad_norm": 0.44251158400098356, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7052, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"eval_loss": 0.7068507671356201, |
|
"eval_runtime": 47.7025, |
|
"eval_samples_per_second": 4.193, |
|
"eval_steps_per_second": 0.524, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.65625, |
|
"grad_norm": 0.4304625716692019, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8176, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.65625, |
|
"eval_loss": 0.7074388265609741, |
|
"eval_runtime": 48.6321, |
|
"eval_samples_per_second": 4.113, |
|
"eval_steps_per_second": 0.514, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"grad_norm": 0.5157530943388945, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7429, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"eval_loss": 0.7071186900138855, |
|
"eval_runtime": 47.9557, |
|
"eval_samples_per_second": 4.171, |
|
"eval_steps_per_second": 0.521, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.71875, |
|
"grad_norm": 0.5469994539610319, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7643, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.71875, |
|
"eval_loss": 0.7050415277481079, |
|
"eval_runtime": 47.5207, |
|
"eval_samples_per_second": 4.209, |
|
"eval_steps_per_second": 0.526, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.4821891223190419, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7795, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"eval_loss": 0.7032743692398071, |
|
"eval_runtime": 47.2902, |
|
"eval_samples_per_second": 4.229, |
|
"eval_steps_per_second": 0.529, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.78125, |
|
"grad_norm": 0.4785594997922253, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7323, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.78125, |
|
"eval_loss": 0.7028358578681946, |
|
"eval_runtime": 47.7841, |
|
"eval_samples_per_second": 4.185, |
|
"eval_steps_per_second": 0.523, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"grad_norm": 0.47200733754346447, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7555, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"eval_loss": 0.7034148573875427, |
|
"eval_runtime": 47.4952, |
|
"eval_samples_per_second": 4.211, |
|
"eval_steps_per_second": 0.526, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"grad_norm": 0.49226670914533455, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6884, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"eval_loss": 0.7038142681121826, |
|
"eval_runtime": 47.6873, |
|
"eval_samples_per_second": 4.194, |
|
"eval_steps_per_second": 0.524, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"grad_norm": 0.4894781168701622, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8079, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"eval_loss": 0.7031099200248718, |
|
"eval_runtime": 47.0438, |
|
"eval_samples_per_second": 4.251, |
|
"eval_steps_per_second": 0.531, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 4.90625, |
|
"grad_norm": 0.44465660848434874, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7868, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 4.90625, |
|
"eval_loss": 0.7025811672210693, |
|
"eval_runtime": 47.2897, |
|
"eval_samples_per_second": 4.229, |
|
"eval_steps_per_second": 0.529, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"grad_norm": 0.4671993515654777, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7949, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"eval_loss": 0.7016230225563049, |
|
"eval_runtime": 48.7147, |
|
"eval_samples_per_second": 4.106, |
|
"eval_steps_per_second": 0.513, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.96875, |
|
"grad_norm": 0.46593892888464733, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7445, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 4.96875, |
|
"eval_loss": 0.7006258964538574, |
|
"eval_runtime": 48.5723, |
|
"eval_samples_per_second": 4.118, |
|
"eval_steps_per_second": 0.515, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.47383657575274585, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7233, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.7000269889831543, |
|
"eval_runtime": 48.7517, |
|
"eval_samples_per_second": 4.102, |
|
"eval_steps_per_second": 0.513, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.03125, |
|
"grad_norm": 0.42723336337060835, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7061, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.03125, |
|
"eval_loss": 0.7001045942306519, |
|
"eval_runtime": 51.0355, |
|
"eval_samples_per_second": 3.919, |
|
"eval_steps_per_second": 0.49, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"grad_norm": 0.452950592019195, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8489, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"eval_loss": 0.7011143565177917, |
|
"eval_runtime": 44.0195, |
|
"eval_samples_per_second": 4.543, |
|
"eval_steps_per_second": 0.568, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.09375, |
|
"grad_norm": 0.49095068041556844, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6523, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.09375, |
|
"eval_loss": 0.7020147442817688, |
|
"eval_runtime": 43.9994, |
|
"eval_samples_per_second": 4.546, |
|
"eval_steps_per_second": 0.568, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"grad_norm": 0.49702685752637826, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7931, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"eval_loss": 0.7026366591453552, |
|
"eval_runtime": 43.7736, |
|
"eval_samples_per_second": 4.569, |
|
"eval_steps_per_second": 0.571, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"grad_norm": 0.5894972181165574, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6297, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"eval_loss": 0.7018793225288391, |
|
"eval_runtime": 43.8277, |
|
"eval_samples_per_second": 4.563, |
|
"eval_steps_per_second": 0.57, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"grad_norm": 0.5431599726243479, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7394, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"eval_loss": 0.701405942440033, |
|
"eval_runtime": 46.007, |
|
"eval_samples_per_second": 4.347, |
|
"eval_steps_per_second": 0.543, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.21875, |
|
"grad_norm": 0.46081080554385206, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7587, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.21875, |
|
"eval_loss": 0.7011873126029968, |
|
"eval_runtime": 45.6739, |
|
"eval_samples_per_second": 4.379, |
|
"eval_steps_per_second": 0.547, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.5186784959253576, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7944, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"eval_loss": 0.7006779313087463, |
|
"eval_runtime": 46.6382, |
|
"eval_samples_per_second": 4.288, |
|
"eval_steps_per_second": 0.536, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.28125, |
|
"grad_norm": 0.484045023962852, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7149, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.28125, |
|
"eval_loss": 0.7005323171615601, |
|
"eval_runtime": 45.7584, |
|
"eval_samples_per_second": 4.371, |
|
"eval_steps_per_second": 0.546, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"grad_norm": 0.5719751134907255, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6939, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"eval_loss": 0.7002266645431519, |
|
"eval_runtime": 45.9679, |
|
"eval_samples_per_second": 4.351, |
|
"eval_steps_per_second": 0.544, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.34375, |
|
"grad_norm": 0.6060894153712378, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7048, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.34375, |
|
"eval_loss": 0.6983186602592468, |
|
"eval_runtime": 47.2598, |
|
"eval_samples_per_second": 4.232, |
|
"eval_steps_per_second": 0.529, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"grad_norm": 0.5548499769346423, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7881, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"eval_loss": 0.6966648697853088, |
|
"eval_runtime": 47.0803, |
|
"eval_samples_per_second": 4.248, |
|
"eval_steps_per_second": 0.531, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.40625, |
|
"grad_norm": 0.5102316819603098, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7542, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.40625, |
|
"eval_loss": 0.6953878998756409, |
|
"eval_runtime": 48.3238, |
|
"eval_samples_per_second": 4.139, |
|
"eval_steps_per_second": 0.517, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"grad_norm": 0.5399890621278476, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7937, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"eval_loss": 0.69431471824646, |
|
"eval_runtime": 49.2122, |
|
"eval_samples_per_second": 4.064, |
|
"eval_steps_per_second": 0.508, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"grad_norm": 0.5252423839534397, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7767, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"eval_loss": 0.6944937109947205, |
|
"eval_runtime": 49.0039, |
|
"eval_samples_per_second": 4.081, |
|
"eval_steps_per_second": 0.51, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.5422683424689886, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7171, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"eval_loss": 0.6943515539169312, |
|
"eval_runtime": 48.7295, |
|
"eval_samples_per_second": 4.104, |
|
"eval_steps_per_second": 0.513, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.53125, |
|
"grad_norm": 0.551339022612633, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7529, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.53125, |
|
"eval_loss": 0.6935855150222778, |
|
"eval_runtime": 50.259, |
|
"eval_samples_per_second": 3.979, |
|
"eval_steps_per_second": 0.497, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"grad_norm": 0.5040662348893271, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7816, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"eval_loss": 0.6929727792739868, |
|
"eval_runtime": 49.9267, |
|
"eval_samples_per_second": 4.006, |
|
"eval_steps_per_second": 0.501, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.59375, |
|
"grad_norm": 0.538094993002792, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6785, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.59375, |
|
"eval_loss": 0.6930323839187622, |
|
"eval_runtime": 48.28, |
|
"eval_samples_per_second": 4.143, |
|
"eval_steps_per_second": 0.518, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"grad_norm": 0.5367726605699668, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6868, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"eval_loss": 0.6928802728652954, |
|
"eval_runtime": 49.8478, |
|
"eval_samples_per_second": 4.012, |
|
"eval_steps_per_second": 0.502, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.65625, |
|
"grad_norm": 0.5978542074838507, |
|
"learning_rate": 2e-05, |
|
"loss": 0.698, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.65625, |
|
"eval_loss": 0.6921787858009338, |
|
"eval_runtime": 50.778, |
|
"eval_samples_per_second": 3.939, |
|
"eval_steps_per_second": 0.492, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"grad_norm": 0.5779173967988954, |
|
"learning_rate": 2e-05, |
|
"loss": 0.664, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"eval_loss": 0.6921034455299377, |
|
"eval_runtime": 49.7171, |
|
"eval_samples_per_second": 4.023, |
|
"eval_steps_per_second": 0.503, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.71875, |
|
"grad_norm": 0.6377165996743129, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7051, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.71875, |
|
"eval_loss": 0.6914942264556885, |
|
"eval_runtime": 51.9608, |
|
"eval_samples_per_second": 3.849, |
|
"eval_steps_per_second": 0.481, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.6093388082076064, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6903, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"eval_loss": 0.6904594302177429, |
|
"eval_runtime": 49.6144, |
|
"eval_samples_per_second": 4.031, |
|
"eval_steps_per_second": 0.504, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"grad_norm": 0.5987747297973711, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7368, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"eval_loss": 0.6894869804382324, |
|
"eval_runtime": 49.7122, |
|
"eval_samples_per_second": 4.023, |
|
"eval_steps_per_second": 0.503, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"grad_norm": 0.5914952733954625, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7003, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"eval_loss": 0.6885225772857666, |
|
"eval_runtime": 49.8474, |
|
"eval_samples_per_second": 4.012, |
|
"eval_steps_per_second": 0.502, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 5.84375, |
|
"grad_norm": 0.5641237505681922, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7571, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 5.84375, |
|
"eval_loss": 0.6889610290527344, |
|
"eval_runtime": 51.5925, |
|
"eval_samples_per_second": 3.877, |
|
"eval_steps_per_second": 0.485, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"grad_norm": 0.5566285784572296, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6882, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"eval_loss": 0.6903389692306519, |
|
"eval_runtime": 49.713, |
|
"eval_samples_per_second": 4.023, |
|
"eval_steps_per_second": 0.503, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 5.90625, |
|
"grad_norm": 0.5594562993560854, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7028, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 5.90625, |
|
"eval_loss": 0.6911373734474182, |
|
"eval_runtime": 49.929, |
|
"eval_samples_per_second": 4.006, |
|
"eval_steps_per_second": 0.501, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"grad_norm": 0.6114177699067616, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7181, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"eval_loss": 0.6901592016220093, |
|
"eval_runtime": 49.9032, |
|
"eval_samples_per_second": 4.008, |
|
"eval_steps_per_second": 0.501, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.96875, |
|
"grad_norm": 0.5564307101453613, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7116, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 5.96875, |
|
"eval_loss": 0.6883879899978638, |
|
"eval_runtime": 49.9457, |
|
"eval_samples_per_second": 4.004, |
|
"eval_steps_per_second": 0.501, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.5242139835965315, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6956, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.686991274356842, |
|
"eval_runtime": 51.3206, |
|
"eval_samples_per_second": 3.897, |
|
"eval_steps_per_second": 0.487, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.03125, |
|
"grad_norm": 0.5661038874224659, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7667, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.03125, |
|
"eval_loss": 0.6863989233970642, |
|
"eval_runtime": 50.3486, |
|
"eval_samples_per_second": 3.972, |
|
"eval_steps_per_second": 0.497, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"grad_norm": 0.5015705892320539, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7289, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"eval_loss": 0.6869972348213196, |
|
"eval_runtime": 51.6966, |
|
"eval_samples_per_second": 3.869, |
|
"eval_steps_per_second": 0.484, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"grad_norm": 0.5679476318211268, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6595, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"eval_loss": 0.6878303289413452, |
|
"eval_runtime": 44.1921, |
|
"eval_samples_per_second": 4.526, |
|
"eval_steps_per_second": 0.566, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"grad_norm": 0.5496769650020654, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6934, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"eval_loss": 0.689085841178894, |
|
"eval_runtime": 44.0432, |
|
"eval_samples_per_second": 4.541, |
|
"eval_steps_per_second": 0.568, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.15625, |
|
"grad_norm": 0.5761731163916711, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7212, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.15625, |
|
"eval_loss": 0.6919547915458679, |
|
"eval_runtime": 45.3631, |
|
"eval_samples_per_second": 4.409, |
|
"eval_steps_per_second": 0.551, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"grad_norm": 0.6093485410765964, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8013, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"eval_loss": 0.6936098337173462, |
|
"eval_runtime": 44.1956, |
|
"eval_samples_per_second": 4.525, |
|
"eval_steps_per_second": 0.566, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.21875, |
|
"grad_norm": 0.6670365325797192, |
|
"learning_rate": 2e-05, |
|
"loss": 0.666, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.21875, |
|
"eval_loss": 0.693129301071167, |
|
"eval_runtime": 44.0131, |
|
"eval_samples_per_second": 4.544, |
|
"eval_steps_per_second": 0.568, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.6464592274733308, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7134, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"eval_loss": 0.6912326216697693, |
|
"eval_runtime": 44.0, |
|
"eval_samples_per_second": 4.545, |
|
"eval_steps_per_second": 0.568, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.28125, |
|
"grad_norm": 0.6088225232188101, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7405, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.28125, |
|
"eval_loss": 0.6896650195121765, |
|
"eval_runtime": 44.3194, |
|
"eval_samples_per_second": 4.513, |
|
"eval_steps_per_second": 0.564, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"grad_norm": 0.6638309972807995, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6542, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"eval_loss": 0.6878445148468018, |
|
"eval_runtime": 44.2101, |
|
"eval_samples_per_second": 4.524, |
|
"eval_steps_per_second": 0.565, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.34375, |
|
"grad_norm": 0.5632348029553863, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7953, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.34375, |
|
"eval_loss": 0.6869116425514221, |
|
"eval_runtime": 44.0039, |
|
"eval_samples_per_second": 4.545, |
|
"eval_steps_per_second": 0.568, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"grad_norm": 0.6753158068984167, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6369, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"eval_loss": 0.6856124997138977, |
|
"eval_runtime": 44.2493, |
|
"eval_samples_per_second": 4.52, |
|
"eval_steps_per_second": 0.565, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"grad_norm": 0.5601655147962107, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6291, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"eval_loss": 0.685504138469696, |
|
"eval_runtime": 43.9463, |
|
"eval_samples_per_second": 4.551, |
|
"eval_steps_per_second": 0.569, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"grad_norm": 0.6578412065562369, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6887, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"eval_loss": 0.6858142018318176, |
|
"eval_runtime": 45.1556, |
|
"eval_samples_per_second": 4.429, |
|
"eval_steps_per_second": 0.554, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.46875, |
|
"grad_norm": 0.6149787250576099, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7375, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.46875, |
|
"eval_loss": 0.6860241889953613, |
|
"eval_runtime": 44.9447, |
|
"eval_samples_per_second": 4.45, |
|
"eval_steps_per_second": 0.556, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.6674521606961297, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6856, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"eval_loss": 0.6866363286972046, |
|
"eval_runtime": 44.714, |
|
"eval_samples_per_second": 4.473, |
|
"eval_steps_per_second": 0.559, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.53125, |
|
"grad_norm": 0.700420859386899, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6556, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.53125, |
|
"eval_loss": 0.6870286464691162, |
|
"eval_runtime": 44.8923, |
|
"eval_samples_per_second": 4.455, |
|
"eval_steps_per_second": 0.557, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"grad_norm": 0.6530651968630973, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6334, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"eval_loss": 0.6872709393501282, |
|
"eval_runtime": 44.7944, |
|
"eval_samples_per_second": 4.465, |
|
"eval_steps_per_second": 0.558, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.59375, |
|
"grad_norm": 0.695757498482456, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6784, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.59375, |
|
"eval_loss": 0.6869171857833862, |
|
"eval_runtime": 45.755, |
|
"eval_samples_per_second": 4.371, |
|
"eval_steps_per_second": 0.546, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"grad_norm": 0.642060810781652, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6489, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"eval_loss": 0.685666024684906, |
|
"eval_runtime": 46.4458, |
|
"eval_samples_per_second": 4.306, |
|
"eval_steps_per_second": 0.538, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.65625, |
|
"grad_norm": 0.6088750940603561, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7216, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.65625, |
|
"eval_loss": 0.6843697428703308, |
|
"eval_runtime": 46.1389, |
|
"eval_samples_per_second": 4.335, |
|
"eval_steps_per_second": 0.542, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"grad_norm": 0.6043945628080053, |
|
"learning_rate": 2e-05, |
|
"loss": 0.692, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"eval_loss": 0.6836680769920349, |
|
"eval_runtime": 47.7324, |
|
"eval_samples_per_second": 4.19, |
|
"eval_steps_per_second": 0.524, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"grad_norm": 0.6506615838970475, |
|
"learning_rate": 2e-05, |
|
"loss": 0.691, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"eval_loss": 0.6824812293052673, |
|
"eval_runtime": 45.8056, |
|
"eval_samples_per_second": 4.366, |
|
"eval_steps_per_second": 0.546, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.6878268158673746, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6894, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"eval_loss": 0.6817054748535156, |
|
"eval_runtime": 46.47, |
|
"eval_samples_per_second": 4.304, |
|
"eval_steps_per_second": 0.538, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.78125, |
|
"grad_norm": 0.6793999118325932, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6394, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 6.78125, |
|
"eval_loss": 0.6831635236740112, |
|
"eval_runtime": 47.8532, |
|
"eval_samples_per_second": 4.179, |
|
"eval_steps_per_second": 0.522, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"grad_norm": 0.6935365262523343, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6341, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"eval_loss": 0.6843095421791077, |
|
"eval_runtime": 46.3828, |
|
"eval_samples_per_second": 4.312, |
|
"eval_steps_per_second": 0.539, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 6.84375, |
|
"grad_norm": 0.8071019513751874, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7211, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 6.84375, |
|
"eval_loss": 0.6839814782142639, |
|
"eval_runtime": 46.5771, |
|
"eval_samples_per_second": 4.294, |
|
"eval_steps_per_second": 0.537, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"grad_norm": 0.7202535741704769, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7305, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"eval_loss": 0.6822354197502136, |
|
"eval_runtime": 46.6149, |
|
"eval_samples_per_second": 4.29, |
|
"eval_steps_per_second": 0.536, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.90625, |
|
"grad_norm": 0.6829442890004696, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6965, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 6.90625, |
|
"eval_loss": 0.6804749369621277, |
|
"eval_runtime": 47.9027, |
|
"eval_samples_per_second": 4.175, |
|
"eval_steps_per_second": 0.522, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"grad_norm": 0.7007337811403486, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6948, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"eval_loss": 0.6785742044448853, |
|
"eval_runtime": 48.3484, |
|
"eval_samples_per_second": 4.137, |
|
"eval_steps_per_second": 0.517, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 6.96875, |
|
"grad_norm": 0.6672225040660534, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7075, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 6.96875, |
|
"eval_loss": 0.6771878004074097, |
|
"eval_runtime": 46.3836, |
|
"eval_samples_per_second": 4.312, |
|
"eval_steps_per_second": 0.539, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.6893374424350143, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7652, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.6772673726081848, |
|
"eval_runtime": 47.0913, |
|
"eval_samples_per_second": 4.247, |
|
"eval_steps_per_second": 0.531, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 7.03125, |
|
"grad_norm": 0.5866908507437849, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6784, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 7.03125, |
|
"eval_loss": 0.6778077483177185, |
|
"eval_runtime": 46.7766, |
|
"eval_samples_per_second": 4.276, |
|
"eval_steps_per_second": 0.534, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 7.0625, |
|
"grad_norm": 0.6620785641323407, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6107, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 7.0625, |
|
"eval_loss": 0.6797336339950562, |
|
"eval_runtime": 47.0779, |
|
"eval_samples_per_second": 4.248, |
|
"eval_steps_per_second": 0.531, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 7.09375, |
|
"grad_norm": 0.6646660025868149, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6824, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 7.09375, |
|
"eval_loss": 0.6831703186035156, |
|
"eval_runtime": 46.4223, |
|
"eval_samples_per_second": 4.308, |
|
"eval_steps_per_second": 0.539, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 7.125, |
|
"grad_norm": 0.7653429329219695, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6289, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 7.125, |
|
"eval_loss": 0.6889806985855103, |
|
"eval_runtime": 48.2668, |
|
"eval_samples_per_second": 4.144, |
|
"eval_steps_per_second": 0.518, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 7.15625, |
|
"grad_norm": 0.888507299589656, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6405, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 7.15625, |
|
"eval_loss": 0.6938297748565674, |
|
"eval_runtime": 48.2833, |
|
"eval_samples_per_second": 4.142, |
|
"eval_steps_per_second": 0.518, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"grad_norm": 0.8483995966585272, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6256, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"eval_loss": 0.6941313147544861, |
|
"eval_runtime": 46.6028, |
|
"eval_samples_per_second": 4.292, |
|
"eval_steps_per_second": 0.536, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 7.21875, |
|
"grad_norm": 0.8529011065789557, |
|
"learning_rate": 2e-05, |
|
"loss": 0.719, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 7.21875, |
|
"eval_loss": 0.6908813714981079, |
|
"eval_runtime": 47.7668, |
|
"eval_samples_per_second": 4.187, |
|
"eval_steps_per_second": 0.523, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.7891947191711363, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7122, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"eval_loss": 0.6873031854629517, |
|
"eval_runtime": 46.9441, |
|
"eval_samples_per_second": 4.26, |
|
"eval_steps_per_second": 0.533, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 7.28125, |
|
"grad_norm": 0.8410831266636205, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6655, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 7.28125, |
|
"eval_loss": 0.6842228174209595, |
|
"eval_runtime": 48.184, |
|
"eval_samples_per_second": 4.151, |
|
"eval_steps_per_second": 0.519, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 7.3125, |
|
"grad_norm": 0.7543966645145809, |
|
"learning_rate": 2e-05, |
|
"loss": 0.702, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 7.3125, |
|
"eval_loss": 0.6826092600822449, |
|
"eval_runtime": 48.7587, |
|
"eval_samples_per_second": 4.102, |
|
"eval_steps_per_second": 0.513, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 7.34375, |
|
"grad_norm": 0.69863349246919, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6676, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 7.34375, |
|
"eval_loss": 0.6820936799049377, |
|
"eval_runtime": 46.5095, |
|
"eval_samples_per_second": 4.3, |
|
"eval_steps_per_second": 0.538, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 7.375, |
|
"grad_norm": 0.7718198795174328, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6322, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 7.375, |
|
"eval_loss": 0.681590735912323, |
|
"eval_runtime": 47.6491, |
|
"eval_samples_per_second": 4.197, |
|
"eval_steps_per_second": 0.525, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 7.40625, |
|
"grad_norm": 0.8032644336352275, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6835, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 7.40625, |
|
"eval_loss": 0.6806458234786987, |
|
"eval_runtime": 47.1412, |
|
"eval_samples_per_second": 4.243, |
|
"eval_steps_per_second": 0.53, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 7.4375, |
|
"grad_norm": 0.8165151350063435, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6744, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 7.4375, |
|
"eval_loss": 0.6802331805229187, |
|
"eval_runtime": 48.2476, |
|
"eval_samples_per_second": 4.145, |
|
"eval_steps_per_second": 0.518, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 7.46875, |
|
"grad_norm": 0.7665175082054141, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6955, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 7.46875, |
|
"eval_loss": 0.6806652545928955, |
|
"eval_runtime": 46.6541, |
|
"eval_samples_per_second": 4.287, |
|
"eval_steps_per_second": 0.536, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.7584547487112137, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6374, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"eval_loss": 0.6825945973396301, |
|
"eval_runtime": 46.3848, |
|
"eval_samples_per_second": 4.312, |
|
"eval_steps_per_second": 0.539, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.53125, |
|
"grad_norm": 0.660822695597991, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6825, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 7.53125, |
|
"eval_loss": 0.6861986517906189, |
|
"eval_runtime": 46.2732, |
|
"eval_samples_per_second": 4.322, |
|
"eval_steps_per_second": 0.54, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 7.5625, |
|
"grad_norm": 0.7793836425815985, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6824, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 7.5625, |
|
"eval_loss": 0.6895106434822083, |
|
"eval_runtime": 46.6462, |
|
"eval_samples_per_second": 4.288, |
|
"eval_steps_per_second": 0.536, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 7.59375, |
|
"grad_norm": 0.8237113294656135, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6604, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 7.59375, |
|
"eval_loss": 0.6898853778839111, |
|
"eval_runtime": 46.7904, |
|
"eval_samples_per_second": 4.274, |
|
"eval_steps_per_second": 0.534, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 7.625, |
|
"grad_norm": 0.9966126829271594, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7297, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 7.625, |
|
"eval_loss": 0.6854925751686096, |
|
"eval_runtime": 46.5541, |
|
"eval_samples_per_second": 4.296, |
|
"eval_steps_per_second": 0.537, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 7.65625, |
|
"grad_norm": 0.7581680879353856, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6319, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 7.65625, |
|
"eval_loss": 0.6836807131767273, |
|
"eval_runtime": 48.3404, |
|
"eval_samples_per_second": 4.137, |
|
"eval_steps_per_second": 0.517, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 7.6875, |
|
"grad_norm": 0.799947909805063, |
|
"learning_rate": 2e-05, |
|
"loss": 0.672, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 7.6875, |
|
"eval_loss": 0.681761622428894, |
|
"eval_runtime": 50.0597, |
|
"eval_samples_per_second": 3.995, |
|
"eval_steps_per_second": 0.499, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 7.71875, |
|
"grad_norm": 0.8377626405796506, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6727, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 7.71875, |
|
"eval_loss": 0.6791908144950867, |
|
"eval_runtime": 49.25, |
|
"eval_samples_per_second": 4.061, |
|
"eval_steps_per_second": 0.508, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.7237789197029182, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6576, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"eval_loss": 0.6767004132270813, |
|
"eval_runtime": 48.5162, |
|
"eval_samples_per_second": 4.122, |
|
"eval_steps_per_second": 0.515, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 7.78125, |
|
"grad_norm": 0.7946831722044173, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7029, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 7.78125, |
|
"eval_loss": 0.675483763217926, |
|
"eval_runtime": 49.9932, |
|
"eval_samples_per_second": 4.001, |
|
"eval_steps_per_second": 0.5, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"grad_norm": 0.7259305030593936, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7109, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"eval_loss": 0.6768932938575745, |
|
"eval_runtime": 49.852, |
|
"eval_samples_per_second": 4.012, |
|
"eval_steps_per_second": 0.501, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 7.84375, |
|
"grad_norm": 0.7340863248905795, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6231, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 7.84375, |
|
"eval_loss": 0.6790910363197327, |
|
"eval_runtime": 51.2892, |
|
"eval_samples_per_second": 3.899, |
|
"eval_steps_per_second": 0.487, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 7.875, |
|
"grad_norm": 0.8413325044551803, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6325, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 7.875, |
|
"eval_loss": 0.6796602010726929, |
|
"eval_runtime": 51.5508, |
|
"eval_samples_per_second": 3.88, |
|
"eval_steps_per_second": 0.485, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 7.90625, |
|
"grad_norm": 0.7927416396360353, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7207, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 7.90625, |
|
"eval_loss": 0.6797543168067932, |
|
"eval_runtime": 51.7355, |
|
"eval_samples_per_second": 3.866, |
|
"eval_steps_per_second": 0.483, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 7.9375, |
|
"grad_norm": 0.7510046984656369, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6728, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 7.9375, |
|
"eval_loss": 0.6813901662826538, |
|
"eval_runtime": 50.2001, |
|
"eval_samples_per_second": 3.984, |
|
"eval_steps_per_second": 0.498, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 7.96875, |
|
"grad_norm": 0.8061013994114622, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6006, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 7.96875, |
|
"eval_loss": 0.681613028049469, |
|
"eval_runtime": 49.7101, |
|
"eval_samples_per_second": 4.023, |
|
"eval_steps_per_second": 0.503, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.7889275388211946, |
|
"learning_rate": 2e-05, |
|
"loss": 0.662, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.6804400086402893, |
|
"eval_runtime": 51.28, |
|
"eval_samples_per_second": 3.9, |
|
"eval_steps_per_second": 0.488, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 8.03125, |
|
"grad_norm": 0.7870763956359581, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6302, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 8.03125, |
|
"eval_loss": 0.6809322834014893, |
|
"eval_runtime": 52.7641, |
|
"eval_samples_per_second": 3.79, |
|
"eval_steps_per_second": 0.474, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 8.0625, |
|
"grad_norm": 0.7603743206060642, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6426, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 8.0625, |
|
"eval_loss": 0.683021068572998, |
|
"eval_runtime": 43.8381, |
|
"eval_samples_per_second": 4.562, |
|
"eval_steps_per_second": 0.57, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 8.09375, |
|
"grad_norm": 0.7751516747488628, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6734, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 8.09375, |
|
"eval_loss": 0.685730516910553, |
|
"eval_runtime": 43.9143, |
|
"eval_samples_per_second": 4.554, |
|
"eval_steps_per_second": 0.569, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"grad_norm": 0.8783715889493854, |
|
"learning_rate": 2e-05, |
|
"loss": 0.685, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"eval_loss": 0.6876766085624695, |
|
"eval_runtime": 43.8107, |
|
"eval_samples_per_second": 4.565, |
|
"eval_steps_per_second": 0.571, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 8.15625, |
|
"grad_norm": 0.8683763894470441, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6111, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 8.15625, |
|
"eval_loss": 0.6892675757408142, |
|
"eval_runtime": 45.4312, |
|
"eval_samples_per_second": 4.402, |
|
"eval_steps_per_second": 0.55, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 8.1875, |
|
"grad_norm": 0.83301264234889, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7238, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 8.1875, |
|
"eval_loss": 0.6900019645690918, |
|
"eval_runtime": 43.7899, |
|
"eval_samples_per_second": 4.567, |
|
"eval_steps_per_second": 0.571, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 8.21875, |
|
"grad_norm": 0.9311076945185538, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5936, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 8.21875, |
|
"eval_loss": 0.6899961233139038, |
|
"eval_runtime": 45.0746, |
|
"eval_samples_per_second": 4.437, |
|
"eval_steps_per_second": 0.555, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 0.8715436312553682, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6483, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"eval_loss": 0.690051257610321, |
|
"eval_runtime": 43.9844, |
|
"eval_samples_per_second": 4.547, |
|
"eval_steps_per_second": 0.568, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 8.28125, |
|
"grad_norm": 0.9923902289464986, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6718, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 8.28125, |
|
"eval_loss": 0.688658595085144, |
|
"eval_runtime": 43.8005, |
|
"eval_samples_per_second": 4.566, |
|
"eval_steps_per_second": 0.571, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 8.3125, |
|
"grad_norm": 0.8485704756867186, |
|
"learning_rate": 2e-05, |
|
"loss": 0.663, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 8.3125, |
|
"eval_loss": 0.6868423223495483, |
|
"eval_runtime": 46.8136, |
|
"eval_samples_per_second": 4.272, |
|
"eval_steps_per_second": 0.534, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 8.34375, |
|
"grad_norm": 0.8355813738463048, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5884, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 8.34375, |
|
"eval_loss": 0.6864896416664124, |
|
"eval_runtime": 46.0477, |
|
"eval_samples_per_second": 4.343, |
|
"eval_steps_per_second": 0.543, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 8.375, |
|
"grad_norm": 0.8932260711586627, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6466, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 8.375, |
|
"eval_loss": 0.6860455274581909, |
|
"eval_runtime": 46.3159, |
|
"eval_samples_per_second": 4.318, |
|
"eval_steps_per_second": 0.54, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 8.40625, |
|
"grad_norm": 0.8536230233577757, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6364, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 8.40625, |
|
"eval_loss": 0.6861154437065125, |
|
"eval_runtime": 45.4048, |
|
"eval_samples_per_second": 4.405, |
|
"eval_steps_per_second": 0.551, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 8.4375, |
|
"grad_norm": 0.83328335532683, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6419, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 8.4375, |
|
"eval_loss": 0.6856899261474609, |
|
"eval_runtime": 46.609, |
|
"eval_samples_per_second": 4.291, |
|
"eval_steps_per_second": 0.536, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 8.46875, |
|
"grad_norm": 0.8841406022945117, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5383, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 8.46875, |
|
"eval_loss": 0.6865776181221008, |
|
"eval_runtime": 47.0757, |
|
"eval_samples_per_second": 4.248, |
|
"eval_steps_per_second": 0.531, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.8194392324450703, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6376, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"eval_loss": 0.6892414689064026, |
|
"eval_runtime": 46.8669, |
|
"eval_samples_per_second": 4.267, |
|
"eval_steps_per_second": 0.533, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 8.53125, |
|
"grad_norm": 0.937948691760343, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6485, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 8.53125, |
|
"eval_loss": 0.6890290975570679, |
|
"eval_runtime": 46.649, |
|
"eval_samples_per_second": 4.287, |
|
"eval_steps_per_second": 0.536, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 8.5625, |
|
"grad_norm": 0.9240471094453983, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6387, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 8.5625, |
|
"eval_loss": 0.6875545382499695, |
|
"eval_runtime": 48.2193, |
|
"eval_samples_per_second": 4.148, |
|
"eval_steps_per_second": 0.518, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 8.59375, |
|
"grad_norm": 0.9186571178066892, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6503, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 8.59375, |
|
"eval_loss": 0.6848871111869812, |
|
"eval_runtime": 46.9651, |
|
"eval_samples_per_second": 4.258, |
|
"eval_steps_per_second": 0.532, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 8.625, |
|
"grad_norm": 0.9603067514462874, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6429, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 8.625, |
|
"eval_loss": 0.68189537525177, |
|
"eval_runtime": 47.959, |
|
"eval_samples_per_second": 4.17, |
|
"eval_steps_per_second": 0.521, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 8.65625, |
|
"grad_norm": 0.8632677172122276, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5888, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 8.65625, |
|
"eval_loss": 0.6817250847816467, |
|
"eval_runtime": 47.5519, |
|
"eval_samples_per_second": 4.206, |
|
"eval_steps_per_second": 0.526, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 8.6875, |
|
"grad_norm": 0.9096699999767647, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6434, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 8.6875, |
|
"eval_loss": 0.6826667785644531, |
|
"eval_runtime": 48.058, |
|
"eval_samples_per_second": 4.162, |
|
"eval_steps_per_second": 0.52, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 8.71875, |
|
"grad_norm": 0.8315455850502919, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6012, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 8.71875, |
|
"eval_loss": 0.6839814782142639, |
|
"eval_runtime": 48.1576, |
|
"eval_samples_per_second": 4.153, |
|
"eval_steps_per_second": 0.519, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.9058679893646637, |
|
"learning_rate": 2e-05, |
|
"loss": 0.676, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"eval_loss": 0.6849075555801392, |
|
"eval_runtime": 47.9952, |
|
"eval_samples_per_second": 4.167, |
|
"eval_steps_per_second": 0.521, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 8.78125, |
|
"grad_norm": 0.8626848465032242, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6137, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 8.78125, |
|
"eval_loss": 0.6846147775650024, |
|
"eval_runtime": 50.2338, |
|
"eval_samples_per_second": 3.981, |
|
"eval_steps_per_second": 0.498, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 8.8125, |
|
"grad_norm": 0.8473178170336938, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6017, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 8.8125, |
|
"eval_loss": 0.6846247911453247, |
|
"eval_runtime": 49.6161, |
|
"eval_samples_per_second": 4.031, |
|
"eval_steps_per_second": 0.504, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 8.84375, |
|
"grad_norm": 0.8161205540198673, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5811, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 8.84375, |
|
"eval_loss": 0.6851673126220703, |
|
"eval_runtime": 48.2057, |
|
"eval_samples_per_second": 4.149, |
|
"eval_steps_per_second": 0.519, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 8.875, |
|
"grad_norm": 0.8854404259280148, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5459, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 8.875, |
|
"eval_loss": 0.685972273349762, |
|
"eval_runtime": 49.0992, |
|
"eval_samples_per_second": 4.073, |
|
"eval_steps_per_second": 0.509, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 8.90625, |
|
"grad_norm": 0.9439945965022273, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5908, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 8.90625, |
|
"eval_loss": 0.6852046847343445, |
|
"eval_runtime": 48.1612, |
|
"eval_samples_per_second": 4.153, |
|
"eval_steps_per_second": 0.519, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 8.9375, |
|
"grad_norm": 1.0054677849137328, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7215, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 8.9375, |
|
"eval_loss": 0.6840152144432068, |
|
"eval_runtime": 48.2329, |
|
"eval_samples_per_second": 4.147, |
|
"eval_steps_per_second": 0.518, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 8.96875, |
|
"grad_norm": 0.8657465123021779, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6479, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 8.96875, |
|
"eval_loss": 0.6845163106918335, |
|
"eval_runtime": 47.9574, |
|
"eval_samples_per_second": 4.17, |
|
"eval_steps_per_second": 0.521, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.9781677785178013, |
|
"learning_rate": 2e-05, |
|
"loss": 0.598, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 0.6835929751396179, |
|
"eval_runtime": 48.3854, |
|
"eval_samples_per_second": 4.133, |
|
"eval_steps_per_second": 0.517, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 9.03125, |
|
"grad_norm": 0.8913448503162013, |
|
"learning_rate": 2e-05, |
|
"loss": 0.608, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 9.03125, |
|
"eval_loss": 0.682920515537262, |
|
"eval_runtime": 48.0787, |
|
"eval_samples_per_second": 4.16, |
|
"eval_steps_per_second": 0.52, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 9.0625, |
|
"grad_norm": 0.8910028425785708, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6249, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 9.0625, |
|
"eval_loss": 0.6842910647392273, |
|
"eval_runtime": 45.3447, |
|
"eval_samples_per_second": 4.411, |
|
"eval_steps_per_second": 0.551, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 9.09375, |
|
"grad_norm": 0.8766964747132081, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6198, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 9.09375, |
|
"eval_loss": 0.6897236704826355, |
|
"eval_runtime": 44.1159, |
|
"eval_samples_per_second": 4.534, |
|
"eval_steps_per_second": 0.567, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 9.125, |
|
"grad_norm": 1.0295884589810356, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5993, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 9.125, |
|
"eval_loss": 0.6943468451499939, |
|
"eval_runtime": 43.8108, |
|
"eval_samples_per_second": 4.565, |
|
"eval_steps_per_second": 0.571, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 9.15625, |
|
"grad_norm": 0.9773325211255739, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6508, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 9.15625, |
|
"eval_loss": 0.6970213055610657, |
|
"eval_runtime": 45.2879, |
|
"eval_samples_per_second": 4.416, |
|
"eval_steps_per_second": 0.552, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 9.1875, |
|
"grad_norm": 0.8891126608483751, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5919, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 9.1875, |
|
"eval_loss": 0.6991220116615295, |
|
"eval_runtime": 45.4682, |
|
"eval_samples_per_second": 4.399, |
|
"eval_steps_per_second": 0.55, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 9.21875, |
|
"grad_norm": 1.0482454581695644, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5355, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 9.21875, |
|
"eval_loss": 0.704166054725647, |
|
"eval_runtime": 45.109, |
|
"eval_samples_per_second": 4.434, |
|
"eval_steps_per_second": 0.554, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.9935665009180418, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5624, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"eval_loss": 0.7078476548194885, |
|
"eval_runtime": 43.6811, |
|
"eval_samples_per_second": 4.579, |
|
"eval_steps_per_second": 0.572, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 9.28125, |
|
"grad_norm": 1.1040486086703822, |
|
"learning_rate": 2e-05, |
|
"loss": 0.66, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 9.28125, |
|
"eval_loss": 0.7050178647041321, |
|
"eval_runtime": 43.9806, |
|
"eval_samples_per_second": 4.547, |
|
"eval_steps_per_second": 0.568, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 9.3125, |
|
"grad_norm": 1.2781656869693958, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5966, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 9.3125, |
|
"eval_loss": 0.6992971897125244, |
|
"eval_runtime": 45.6581, |
|
"eval_samples_per_second": 4.38, |
|
"eval_steps_per_second": 0.548, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 9.34375, |
|
"grad_norm": 1.0619252838389437, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5724, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 9.34375, |
|
"eval_loss": 0.6947219967842102, |
|
"eval_runtime": 45.5657, |
|
"eval_samples_per_second": 4.389, |
|
"eval_steps_per_second": 0.549, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"grad_norm": 0.9267592917491817, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5834, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"eval_loss": 0.6934340000152588, |
|
"eval_runtime": 43.7418, |
|
"eval_samples_per_second": 4.572, |
|
"eval_steps_per_second": 0.572, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 9.40625, |
|
"grad_norm": 0.9597103067245094, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5645, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 9.40625, |
|
"eval_loss": 0.6928582787513733, |
|
"eval_runtime": 45.6592, |
|
"eval_samples_per_second": 4.38, |
|
"eval_steps_per_second": 0.548, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 9.4375, |
|
"grad_norm": 1.0528189035992561, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6196, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 9.4375, |
|
"eval_loss": 0.6888896822929382, |
|
"eval_runtime": 44.9727, |
|
"eval_samples_per_second": 4.447, |
|
"eval_steps_per_second": 0.556, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 9.46875, |
|
"grad_norm": 1.0053722794735602, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6154, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 9.46875, |
|
"eval_loss": 0.6855815052986145, |
|
"eval_runtime": 44.7585, |
|
"eval_samples_per_second": 4.468, |
|
"eval_steps_per_second": 0.559, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.8783611726661886, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6542, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"eval_loss": 0.685936689376831, |
|
"eval_runtime": 44.7918, |
|
"eval_samples_per_second": 4.465, |
|
"eval_steps_per_second": 0.558, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 9.53125, |
|
"grad_norm": 0.9143611061568578, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6178, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 9.53125, |
|
"eval_loss": 0.6888444423675537, |
|
"eval_runtime": 46.8021, |
|
"eval_samples_per_second": 4.273, |
|
"eval_steps_per_second": 0.534, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 9.5625, |
|
"grad_norm": 1.0642585786595127, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6078, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 9.5625, |
|
"eval_loss": 0.6898679137229919, |
|
"eval_runtime": 47.6538, |
|
"eval_samples_per_second": 4.197, |
|
"eval_steps_per_second": 0.525, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 9.59375, |
|
"grad_norm": 1.1048937808634194, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6019, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 9.59375, |
|
"eval_loss": 0.6891123056411743, |
|
"eval_runtime": 45.7695, |
|
"eval_samples_per_second": 4.37, |
|
"eval_steps_per_second": 0.546, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 9.625, |
|
"grad_norm": 1.0058213310083948, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6406, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 9.625, |
|
"eval_loss": 0.6902400851249695, |
|
"eval_runtime": 45.7897, |
|
"eval_samples_per_second": 4.368, |
|
"eval_steps_per_second": 0.546, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 9.65625, |
|
"grad_norm": 0.9344450130195062, |
|
"learning_rate": 2e-05, |
|
"loss": 0.607, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 9.65625, |
|
"eval_loss": 0.6951236128807068, |
|
"eval_runtime": 46.8406, |
|
"eval_samples_per_second": 4.27, |
|
"eval_steps_per_second": 0.534, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 9.6875, |
|
"grad_norm": 1.1997135893441022, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5994, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 9.6875, |
|
"eval_loss": 0.6978768706321716, |
|
"eval_runtime": 47.5626, |
|
"eval_samples_per_second": 4.205, |
|
"eval_steps_per_second": 0.526, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 9.71875, |
|
"grad_norm": 1.0755945446749937, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5265, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 9.71875, |
|
"eval_loss": 0.70021653175354, |
|
"eval_runtime": 46.1678, |
|
"eval_samples_per_second": 4.332, |
|
"eval_steps_per_second": 0.542, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"grad_norm": 1.069679239983948, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6212, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"eval_loss": 0.7008029222488403, |
|
"eval_runtime": 47.797, |
|
"eval_samples_per_second": 4.184, |
|
"eval_steps_per_second": 0.523, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 9.78125, |
|
"grad_norm": 0.9717104499586322, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6063, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 9.78125, |
|
"eval_loss": 0.7000299096107483, |
|
"eval_runtime": 46.9892, |
|
"eval_samples_per_second": 4.256, |
|
"eval_steps_per_second": 0.532, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 9.8125, |
|
"grad_norm": 1.117536796971012, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5875, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 9.8125, |
|
"eval_loss": 0.6982808709144592, |
|
"eval_runtime": 48.0867, |
|
"eval_samples_per_second": 4.159, |
|
"eval_steps_per_second": 0.52, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 9.84375, |
|
"grad_norm": 0.987633836102932, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6072, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 9.84375, |
|
"eval_loss": 0.6959852576255798, |
|
"eval_runtime": 46.1188, |
|
"eval_samples_per_second": 4.337, |
|
"eval_steps_per_second": 0.542, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 9.875, |
|
"grad_norm": 0.972220541559008, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5984, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 9.875, |
|
"eval_loss": 0.6931790113449097, |
|
"eval_runtime": 46.363, |
|
"eval_samples_per_second": 4.314, |
|
"eval_steps_per_second": 0.539, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 9.90625, |
|
"grad_norm": 1.073192480739423, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5686, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 9.90625, |
|
"eval_loss": 0.6896910071372986, |
|
"eval_runtime": 46.2139, |
|
"eval_samples_per_second": 4.328, |
|
"eval_steps_per_second": 0.541, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 9.9375, |
|
"grad_norm": 1.0275060141171612, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5825, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 9.9375, |
|
"eval_loss": 0.6866476535797119, |
|
"eval_runtime": 47.6084, |
|
"eval_samples_per_second": 4.201, |
|
"eval_steps_per_second": 0.525, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 9.96875, |
|
"grad_norm": 1.1137122139905515, |
|
"learning_rate": 2e-05, |
|
"loss": 0.614, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 9.96875, |
|
"eval_loss": 0.6832907199859619, |
|
"eval_runtime": 48.0271, |
|
"eval_samples_per_second": 4.164, |
|
"eval_steps_per_second": 0.521, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 1.0329542238815055, |
|
"learning_rate": 2e-05, |
|
"loss": 0.569, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 0.6833243370056152, |
|
"eval_runtime": 46.9821, |
|
"eval_samples_per_second": 4.257, |
|
"eval_steps_per_second": 0.532, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 320, |
|
"total_flos": 414702785134592.0, |
|
"train_loss": 0.12324189562350511, |
|
"train_runtime": 3831.9747, |
|
"train_samples_per_second": 2.61, |
|
"train_steps_per_second": 0.084 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 320, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 5, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 414702785134592.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|